breadlicker45 committed
Commit fcbe832 · 1 Parent(s): 542eb73

Upload 26 files

README.md ADDED
@@ -0,0 +1,81 @@
1
+ # Chat with Meta's LLaMA models at home, made easy
2
+
3
+ This repository is a chat example with [LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) ([arXiv](https://arxiv.org/abs/2302.13971v1)) models running on a typical home PC. You will just need an NVIDIA video card and some RAM to chat with the model.
4
+
5
+ This repo is heavily based on Meta's original repo: https://github.com/facebookresearch/llama
6
+
7
+ And on Venuatu's repo: https://github.com/venuatu/llama
8
+
9
+ ### Examples of chats here
10
+
11
+ https://github.com/facebookresearch/llama/issues/162
12
+
13
+ ### System requirements
14
+ - Modern enough CPU
15
+ - NVIDIA graphics card
16
+ - 64 GB of RAM, or better 128 GB (192 or 256 GB would be perfect)
17
+
18
+ One may run with 32 GB of RAM, but inference will be slow (limited by the read speed of your swap file)
19
+
20
+ I am running this on a 12700K / 128 GB RAM / NVIDIA 3070 Ti 8 GB / a fast, large NVMe, and getting one token from the 30B model every few seconds.
21
+
22
+ For example, the 30B model uses around 70 GB of RAM.
23
+
24
+ If you do not have a powerful video card, you may use another repo for CPU-only inference: https://github.com/randaller/llama-cpu
25
+
26
+ ### Conda Environment Setup Example for Windows 10+
27
+ Download and install Anaconda Python (https://www.anaconda.com) and run Anaconda Prompt
28
+ ```
29
+ conda create -n llama python=3.10
30
+ conda activate llama
31
+ conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
32
+ ```
33
+
34
+ ### Setup
35
+ In a conda env with PyTorch / CUDA available, run
36
+ ```
37
+ pip install -r requirements.txt
38
+ ```
39
+ Then, in this repository, run
40
+ ```
41
+ pip install -e .
42
+ ```
43
+
44
+ ### Download tokenizer and models
45
+ magnet:?xt=urn:btih:ZXXDAUWYLRUXXBHUYEMS6Q5CE5WA3LVA&dn=LLaMA
46
+
47
+ or
48
+
49
+ magnet:xt=urn:btih:b8287ebfa04f879b048d4d4404108cf3e8014352&dn=LLaMA&tr=udp%3a%2f%2ftracker.opentrackr.org%3a1337%2fannounce
50
+
51
+ ### Prepare model
52
+
53
+ First, you need to unshard the model checkpoints into a single file. Let's do this for the 30B model.
54
+
55
+ ```
56
+ python merge-weights.py --input_dir D:\Downloads\LLaMA --model_size 30B
57
+ ```
58
+
59
+ In this example, D:\Downloads\LLaMA is the root folder of the downloaded torrent with the weights.
60
+
61
+ This will create a merged.pth file in the root folder of this repo.
62
+
63
+ Place this file, together with the model's corresponding (torrentroot)/30B/params.json, into the [/model] folder.
64
+
65
+ So you should end up with two files in the [/model] folder: merged.pth and params.json.
66
+
67
+ Place the (torrentroot)/tokenizer.model file into the [/tokenizer] folder of this repo. Now you are ready to go.
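+
+ For reference, after these steps the files used by the chat scripts live at the following paths, relative to this repo's root:
+
+ ```
+ ./model/merged.pth          # produced by merge-weights.py
+ ./model/params.json         # copied from (torrentroot)/30B/
+ ./tokenizer/tokenizer.model # copied from (torrentroot)/
+ ```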
68
+
69
+ ### Run the chat
70
+
71
+ ```
72
+ python example-chat.py ./model ./tokenizer/tokenizer.model
73
+ ```
74
+
75
+ ### Enable multi-line answers
76
+
77
+ If you wish to stop generation not at the "\n" sign but at another signature, such as "User:" (which is also a good idea), or any other string, make the following modification in llama/generation.py:
78
+
79
+ ![image](https://user-images.githubusercontent.com/22396871/224122767-227deda4-a718-4774-a7f9-786c07d379cf.png)
80
+
81
+ Here -5 means removing the last 5 characters from the resulting context, which is the length of your stop signature, "User:" in this example.
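+
+ The exact change is only visible in the screenshot above; as a rough sketch of the same idea (not the literal diff), mirroring the existing newline check inside LLaMA.generate() in llama/generation.py:
+
+ ```
+ # sketch only: stop on a custom signature instead of "\n"
+ stop_signature = "User:"
+ # d is the decoded text for prompt i, as in the surrounding loop
+ if d.count(stop_signature) > prompts[0].count(stop_signature):
+     decoded[i] = d[:-len(stop_signature)]  # -5 characters for "User:"
+     return decoded
+ ```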
example-chat.py ADDED
@@ -0,0 +1,115 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed according to the terms of the GNU General Public License version 3.
3
+
4
+ from typing import Tuple
5
+ import os
6
+ import sys
7
+ import torch
8
+ import fire
9
+ import time
10
+ import json
11
+ import pyarrow as pa
12
+
13
+ from pathlib import Path
14
+
15
+ from llama import ModelArgs, Transformer, Tokenizer, LLaMA
16
+
17
+
18
+ def load(
19
+ ckpt_dir: str,
20
+ tokenizer_path: str,
21
+ max_seq_len: int,
22
+ max_batch_size: int,
23
+ ) -> LLaMA:
24
+ start_time = time.time()
25
+ arrow_dir = Path(ckpt_dir).expanduser() / 'arrow'
26
+
27
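+ # one-time conversion: dump every tensor of each .pth checkpoint into its own Arrow file so it can be memory-mapped below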
+ if not arrow_dir.exists():
28
+ print('Converting checkpoints to arrow format')
29
+ checkpoints = sorted(Path(ckpt_dir).expanduser().glob("*.pth"))
30
+ for ckpt_file in checkpoints:
31
+ print(ckpt_file)
32
+ index = ckpt_file.parts[-1].split('.')[-2]
33
+
34
+ ckpt = torch.load(ckpt_file, map_location='cuda')
35
+ (arrow_dir / index).mkdir(parents=True, exist_ok=True)
36
+ for k, v in ckpt.items():
37
+ tens = pa.Tensor.from_numpy(v.numpy())
38
+ with pa.output_stream(arrow_dir / index / k) as f:
39
+ pa.ipc.write_tensor(tens, f)
40
+ ckpt = None
41
+
42
+ with open(Path(ckpt_dir) / "params.json", "r") as f:
43
+ params = json.loads(f.read())
44
+
45
+ print("Loading checkpoint")
46
+ segments = sorted((arrow_dir / '00').glob("*"))
47
+
48
+ checkpoint = {}
49
+ files = []
50
+ for seg in segments:
51
+ f = pa.memory_map(str(seg))
52
+ files.append(f)
53
+ t = pa.ipc.read_tensor(f).to_numpy()
54
+ t = torch.from_numpy(t)
55
+ checkpoint[seg.parts[-1]] = t
56
+
57
+ # torch.set_default_tensor_type(torch.cuda.HalfTensor)
58
+ torch.set_default_tensor_type(torch.BFloat16Tensor)
59
+ # torch.set_default_tensor_type(torch.FloatTensor)
60
+
61
+ model_args: ModelArgs = ModelArgs(
62
+ max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
63
+ )
64
+ print("Loading tokenizer")
65
+ tokenizer = Tokenizer(model_path=tokenizer_path)
66
+ model_args.vocab_size = tokenizer.n_words
67
+ print("Loading model")
68
+ model = Transformer(model_args)
69
+
70
+ checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
71
+ model.load_state_dict(torch.load(checkpoints[-1]), strict=False)
72
+
73
+ for f in files:
74
+ f.close()
75
+ files = None
76
+
77
+ generator = LLaMA(model, tokenizer)
78
+ print(f"Loaded in {time.time() - start_time:.2f} seconds")
79
+ return generator
80
+
81
+
82
+ def main(
83
+ ckpt_dir: str,
84
+ tokenizer_path: str,
85
+ temperature: float = 0.8,
86
+ top_p: float = 0.95,
87
+ max_seq_len: int = 2048,
88
+ max_batch_size: int = 1, # 16 for 13B, 4 for 30B and 65B, 2 for 1024 seq_len for 30B
89
+ ):
90
+ generator = load(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size)
91
+
92
+ ctx = """A dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, and knows its own limits.
93
+ User: Hello, AI.
94
+ AI: Hello! How can I assist you today?
95
+ """
96
+
97
+ while True:
98
+ prompt = input(f'User: ')
99
+ if ctx != "":
100
+ ctx = ctx + "User: " + prompt + "\n"
101
+ else:
102
+ ctx = prompt + "\n"
103
+
104
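+ # trim the running context to its last ~1920 characters so the prompt keeps fitting into the model's context window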
+ ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
105
+
106
+ if len(ctx.strip()) > 0:
107
+ prompts = [ctx]
108
+ results = generator.generate(
109
+ prompts, max_gen_len=2048, temperature=temperature, top_p=top_p
110
+ )
111
+ ctx = results[0]
112
+
113
+
114
+ if __name__ == "__main__":
115
+ fire.Fire(main)
example-chat2.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed according to the terms of the GNU General Public License version 3.
3
+
4
+ from typing import Tuple
5
+ import os
6
+ import sys
7
+ import torch
8
+ import fire
9
+ import time
10
+ import json
11
+ from pathlib import Path
12
+ from llama import ModelArgs, Transformer, Tokenizer, LLaMA
13
+
14
+
15
+ def load(
16
+ ckpt_dir: str,
17
+ tokenizer_path: str,
18
+ max_seq_len: int,
19
+ max_batch_size: int,
20
+ ) -> LLaMA:
21
+ print("Creating model...")
22
+ start_time = time.time()
23
+ checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
24
+
25
+ with open(Path(ckpt_dir) / "params.json", "r") as f:
26
+ params = json.loads(f.read())
27
+
28
+ model_args: ModelArgs = ModelArgs(
29
+ max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
30
+ )
31
+
32
+ tokenizer = Tokenizer(model_path=tokenizer_path)
33
+ model_args.vocab_size = tokenizer.n_words
34
+
35
+ model = Transformer(model_args)
36
+ model.to("cpu")
37
+
38
+ print("Loading merged checkpoint...")
39
+ checkpoint = torch.load(checkpoints[-1], map_location="cuda")
40
+ model.load_state_dict(checkpoint, strict=False)
41
+ del checkpoint
42
+
43
+ generator = LLaMA(model, tokenizer)
44
+ print(f"Loaded model in {time.time() - start_time:.2f} seconds")
45
+ return generator
46
+
47
+
48
+ def main(
49
+ ckpt_dir: str = './model',
50
+ tokenizer_path: str = './tokenizer/tokenizer.model',
51
+ temperature: float = 0.8,
52
+ top_p: float = 0.95,
53
+ max_seq_len: int = 256, # up to 2048
54
+ max_batch_size: int = 5,
55
+ ):
56
+ # torch.manual_seed(1)
57
+ torch.set_default_dtype(torch.bfloat16)
58
+
59
+ generator = load(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size)
60
+
61
+ while True:
62
+ prompt = input(f'prompt> ')
63
+ if len(prompt.strip()) > 0:
64
+ prompts = [prompt]
65
+ results = generator.generate(
66
+ prompts, max_gen_len=256, temperature=temperature, top_p=top_p
67
+ )
68
+
69
+ for result in results:
70
+ print(result)
71
+
72
+
73
+ if __name__ == "__main__":
74
+ fire.Fire(main)
example-cpu.py ADDED
@@ -0,0 +1,133 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed according to the terms of the GNU General Public License version 3.
3
+
4
+ from typing import Tuple
5
+ import os
6
+ import sys
7
+ import torch
8
+ import fire
9
+ import time
10
+ import json
11
+ from pathlib import Path
12
+ from llama import ModelArgs, Transformer, Tokenizer, LLaMA
13
+
14
+
15
+ def load(
16
+ ckpt_dir: str,
17
+ tokenizer_path: str,
18
+ max_seq_len: int,
19
+ max_batch_size: int,
20
+ ) -> LLaMA:
21
+ print("Creating model...")
22
+ start_time = time.time()
23
+ checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
24
+
25
+ with open(Path(ckpt_dir) / "params.json", "r") as f:
26
+ params = json.loads(f.read())
27
+
28
+ model_args: ModelArgs = ModelArgs(
29
+ max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
30
+ )
31
+
32
+ tokenizer = Tokenizer(model_path=tokenizer_path)
33
+ model_args.vocab_size = tokenizer.n_words
34
+
35
+ model = Transformer(model_args)
36
+
37
+ # Original copyright by tloen
38
+ # https://github.com/tloen/llama-int8/blob/main/example.py
39
+ key_to_dim = {
40
+ "w1": 0,
41
+ "w2": -1,
42
+ "w3": 0,
43
+ "wo": -1,
44
+ "wq": 0,
45
+ "wk": 0,
46
+ "wv": 0,
47
+ "output": 0,
48
+ "tok_embeddings": -1,
49
+ "ffn_norm": None,
50
+ "attention_norm": None,
51
+ "norm": None,
52
+ "rope": None,
53
+ }
54
+
55
+ for i, ckpt in enumerate(checkpoints):
56
+ print(f"Loading checkpoint {i}")
57
+ checkpoint = torch.load(ckpt, map_location="cpu")
58
+ for parameter_name, parameter in model.named_parameters():
59
+ short_name = parameter_name.split(".")[-2]
60
+ if key_to_dim[short_name] is None and i == 0:
61
+ parameter.data = checkpoint[parameter_name]
62
+ elif key_to_dim[short_name] == 0:
63
+ size = checkpoint[parameter_name].size(0)
64
+ parameter.data[size * i: size * (i + 1), :] = checkpoint[
65
+ parameter_name
66
+ ]
67
+ elif key_to_dim[short_name] == -1:
68
+ size = checkpoint[parameter_name].size(-1)
69
+ parameter.data[:, size * i: size * (i + 1)] = checkpoint[
70
+ parameter_name
71
+ ]
72
+ del checkpoint[parameter_name]
73
+ del checkpoint
74
+
75
+ model.to("cpu")
76
+
77
+ generator = LLaMA(model, tokenizer)
78
+ print(f"Loaded model in {time.time() - start_time:.2f} seconds")
79
+ return generator
80
+
81
+
82
+ def main(
83
+ ckpt_dir: str = './model',
84
+ tokenizer_path: str = './tokenizer/tokenizer.model',
85
+ temperature: float = 0.8,
86
+ top_p: float = 0.95,
87
+ max_seq_len: int = 512, # up to 2048
88
+ max_batch_size: int = 32,
89
+ ):
90
+ # torch.manual_seed(1)
91
+ # torch.set_default_dtype(torch.bfloat16)
92
+
93
+ generator = load(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size)
94
+
95
+ prompts = [
96
+ ##### For these prompts, the expected answer is the natural continuation of the prompt #####
97
+
98
+ "I believe the meaning of life is",
99
+ # "Simply put, the theory of relativity states that ",
100
+ # "Building a website can be done in 10 simple steps:\n",
101
+
102
+ ##### Few shot prompts: https://huggingface.co/blog/few-shot-learning-gpt-neo-and-inference-api #####
103
+
104
+ # """Tweet: "I hate it when my phone battery dies."
105
+ # Sentiment: Negative
106
+ # ###
107
+ # Tweet: "My day has been 👍"
108
+ # Sentiment: Positive
109
+ # ###
110
+ # Tweet: "This is the link to the article"
111
+ # Sentiment: Neutral
112
+ # ###
113
+ # Tweet: "This new music video was incredibile"
114
+ # Sentiment:""",
115
+
116
+ # """Translate English to French:
117
+ # sea otter => loutre de mer
118
+ # peppermint => menthe poivrée
119
+ # plush girafe => girafe peluche
120
+ # cheese =>""",
121
+ ]
122
+
123
+ results = generator.generate(
124
+ prompts, max_gen_len=256, temperature=temperature, top_p=top_p
125
+ )
126
+
127
+ for result in results:
128
+ print(result)
129
+ print("\n==================================\n")
130
+
131
+
132
+ if __name__ == "__main__":
133
+ fire.Fire(main)
llama.egg-info/PKG-INFO ADDED
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 1.0
2
+ Name: llama
3
+ Version: 0.0.0
4
+ Summary: UNKNOWN
5
+ Home-page: UNKNOWN
6
+ Author: UNKNOWN
7
+ Author-email: UNKNOWN
8
+ License: UNKNOWN
9
+ Description: UNKNOWN
10
+ Platform: UNKNOWN
llama.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,10 @@
1
+ README.md
2
+ setup.py
3
+ llama/__init__.py
4
+ llama/generation.py
5
+ llama/model.py
6
+ llama/tokenizer.py
7
+ llama.egg-info/PKG-INFO
8
+ llama.egg-info/SOURCES.txt
9
+ llama.egg-info/dependency_links.txt
10
+ llama.egg-info/top_level.txt
llama.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
1
+
llama.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ llama
llama/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed according to the terms of the GNU General Public License version 3.
3
+
4
+ from .generation import LLaMA
5
+ from .model import ModelArgs, Transformer
6
+ from .tokenizer import Tokenizer
llama/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (299 Bytes).
 
llama/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (293 Bytes).
 
llama/__pycache__/generation.cpython-310.pyc ADDED
Binary file (3.02 kB).
 
llama/__pycache__/generation.cpython-37.pyc ADDED
Binary file (3.02 kB).
 
llama/__pycache__/model.cpython-310.pyc ADDED
Binary file (8.24 kB).
 
llama/__pycache__/model.cpython-37.pyc ADDED
Binary file (8.13 kB).
 
llama/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (1.44 kB).
 
llama/__pycache__/tokenizer.cpython-37.pyc ADDED
Binary file (1.44 kB).
 
llama/generation.py ADDED
@@ -0,0 +1,99 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed according to the terms of the GNU General Public License version 3.
3
+
4
+ from typing import List
5
+
6
+ import torch
7
+ import traceback
8
+
9
+ from llama.tokenizer import Tokenizer
10
+ from llama.model import Transformer
11
+ from tqdm import trange
12
+
13
+
14
+ class LLaMA:
15
+ def __init__(self, model: Transformer, tokenizer: Tokenizer):
16
+ self.model = model
17
+ self.tokenizer = tokenizer
18
+
19
+ def generate(
20
+ self,
21
+ prompts: List[str],
22
+ max_gen_len: int,
23
+ temperature: float = 0.8,
24
+ top_p: float = 0.95,
25
+ ) -> List[str]:
26
+ bsz = len(prompts)
27
+ params = self.model.params
28
+ assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)
29
+
30
+ count_newlines = prompts[0].count("\n")
31
+
32
+ prompt_tokens = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts]
33
+
34
+ min_prompt_size = min([len(t) for t in prompt_tokens])
35
+ max_prompt_size = max([len(t) for t in prompt_tokens])
36
+
37
+ total_len = min(params.max_seq_len, max_gen_len + max_prompt_size)
38
+
39
+ tokens = torch.full((bsz, total_len), self.tokenizer.pad_id).long()
40
+ for k, t in enumerate(prompt_tokens):
41
+ tokens[k, : len(t)] = torch.tensor(t).long()
42
+ tokens[k, -1] = self.tokenizer.eos_id
43
+ input_text_mask = tokens != self.tokenizer.pad_id
44
+ start_pos = min_prompt_size
45
+ prev_pos = 0
46
+ decoded = [None] * bsz
47
+ for cur_pos in trange(start_pos, total_len, desc="forward"):
48
+ logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
49
+ if temperature > 0:
50
+ probs = torch.softmax(logits / temperature, dim=-1)
51
+ next_token = sample_top_p(probs, top_p)
52
+ else:
53
+ next_token = torch.argmax(logits, dim=-1)
54
+ next_token = next_token.reshape(-1).cpu()
55
+ # only replace token if prompt has already been generated
56
+ next_token = torch.where(
57
+ input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
58
+ )
59
+ tokens[:, cur_pos] = next_token
60
+ prev_pos = cur_pos
61
+
62
+ print("-" * 30)
63
+ for i, t in enumerate(tokens.tolist()):
64
+ # i = cur_pos
65
+ # t = next_token
66
+ # cut to max gen len
67
+ # t = t[: len(prompt_tokens[i]) + max_gen_len]
68
+ t = t[: min(cur_pos, len(prompt_tokens[i]) + max_gen_len)]
69
+ # cut to eos tok if any
70
+ try:
71
+ t = t[: t.index(self.tokenizer.eos_id)]
72
+ except ValueError:
73
+ pass # traceback.print_exc()
74
+ try:
75
+ d = self.tokenizer.decode(t)
76
+ print([i] * 20)
77
+ print(d)
78
+ decoded[i] = d
79
+
80
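+ # stop as soon as the decoded text contains more newlines than the prompt did, i.e. the AI has finished its reply line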
+ result_count_newlines = d.count("\n")
81
+ if result_count_newlines > count_newlines:
82
+ return decoded
83
+
84
+ except IndexError:
85
+ traceback.print_exc()
86
+ print(t)
87
+ print("-" * 30)
88
+ return decoded
89
+
90
+
91
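+ # nucleus (top-p) sampling: keep the smallest set of most-probable tokens whose cumulative probability exceeds p, renormalize and sample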
+ def sample_top_p(probs, p):
92
+ probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
93
+ probs_sum = torch.cumsum(probs_sort, dim=-1)
94
+ mask = probs_sum - probs_sort > p
95
+ probs_sort[mask] = 0.0
96
+ probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
97
+ next_token = torch.multinomial(probs_sort, num_samples=1)
98
+ next_token = torch.gather(probs_idx, -1, next_token)
99
+ return next_token
llama/model.py ADDED
@@ -0,0 +1,270 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed according to the terms of the GNU General Public License version 3.
3
+
4
+ from typing import Optional, Tuple
5
+ from dataclasses import dataclass
6
+ import math
7
+
8
+ import torch
9
+ from torch import nn
10
+ import torch.nn.functional as F
11
+ from torch.nn.utils import skip_init
12
+
13
+ from tqdm import tqdm
14
+
15
+ @dataclass
16
+ class ModelArgs:
17
+ dim: int = 512
18
+ n_layers: int = 8
19
+ n_heads: int = 8
20
+ vocab_size: int = -1 # defined later by tokenizer
21
+ multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2
22
+ norm_eps: float = 1e-5
23
+
24
+ max_batch_size: int = 32
25
+ max_seq_len: int = 1024
26
+
27
+
28
+ class RMSNorm(torch.nn.Module):
29
+ def __init__(self, dim: int, eps: float = 1e-6):
30
+ super().__init__()
31
+ self.eps = eps
32
+ self.weight = nn.Parameter(torch.ones(dim))
33
+
34
+ def _norm(self, x):
35
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
36
+
37
+ def forward(self, x):
38
+ output = self._norm(x.float()).type_as(x)
39
+ return output * self.weight
40
+
41
+
42
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
43
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
44
+ t = torch.arange(end, device=freqs.device) # type: ignore
45
+ freqs = torch.outer(t, freqs).float() # type: ignore
46
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
47
+ return freqs_cis
48
+
49
+
50
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
51
+ ndim = x.ndim
52
+ assert 0 <= 1 < ndim
53
+ assert freqs_cis.shape == (x.shape[1], x.shape[-1])
54
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
55
+ return freqs_cis.view(*shape)
56
+
57
+
58
+ def apply_rotary_emb(
59
+ xq: torch.Tensor,
60
+ xk: torch.Tensor,
61
+ freqs_cis: torch.Tensor,
62
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
63
+ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
64
+ xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
65
+ freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
66
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
67
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
68
+ return xq_out.type_as(xq), xk_out.type_as(xk)
69
+
70
+
71
+ class Attention(nn.Module):
72
+ def __init__(self, args: ModelArgs):
73
+ super().__init__()
74
+
75
+ self.n_local_heads = args.n_heads # // fs_init.get_model_parallel_world_size()
76
+ self.head_dim = args.dim // args.n_heads
77
+
78
+ self.wq = skip_init(nn.Linear,
79
+ args.dim,
80
+ args.n_heads * self.head_dim,
81
+ bias=False,
82
+ )
83
+ self.wk = skip_init(nn.Linear,
84
+ args.dim,
85
+ args.n_heads * self.head_dim,
86
+ bias=False,
87
+ )
88
+ self.wv = skip_init(nn.Linear,
89
+ args.dim,
90
+ args.n_heads * self.head_dim,
91
+ bias=False,
92
+ )
93
+ self.wo = skip_init(nn.Linear,
94
+ args.n_heads * self.head_dim,
95
+ args.dim,
96
+ bias=False,
97
+ )
98
+
99
+ self.cache_k = torch.zeros(
100
+ (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim)
101
+ ).cuda()
102
+ self.cache_v = torch.zeros(
103
+ (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim)
104
+ ).cuda()
105
+
106
+ def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]):
107
+ bsz, seqlen, _ = x.shape
108
+ xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
109
+
110
+ xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
111
+ xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim)
112
+ xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim)
113
+
114
+ xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
115
+
116
+ self.cache_k = self.cache_k.to(xq)
117
+ self.cache_v = self.cache_v.to(xq)
118
+
119
+ self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk
120
+ self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv
121
+
122
+ keys = self.cache_k[:bsz, : start_pos + seqlen]
123
+ values = self.cache_v[:bsz, : start_pos + seqlen]
124
+
125
+ xq = xq.transpose(1, 2)
126
+ keys = keys.transpose(1, 2)
127
+ values = values.transpose(1, 2)
128
+ scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
129
+ if mask is not None:
130
+ scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen)
131
+ scores = F.softmax(scores.float(), dim=-1).type_as(xq)
132
+ output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim)
133
+ output = output.transpose(
134
+ 1, 2
135
+ ).contiguous().view(bsz, seqlen, -1)
136
+
137
+ return self.wo(output)
138
+
139
+
140
+ class FeedForward(nn.Module):
141
+ def __init__(
142
+ self,
143
+ dim: int,
144
+ hidden_dim: int,
145
+ multiple_of: int,
146
+ ):
147
+ super().__init__()
148
+ hidden_dim = int(2 * hidden_dim / 3)
149
+ hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
150
+
151
+ self.w1 = skip_init(nn.Linear,
152
+ dim,
153
+ hidden_dim,
154
+ bias=False,
155
+ )
156
+ self.w2 = skip_init(nn.Linear,
157
+ hidden_dim,
158
+ dim,
159
+ bias=False,
160
+ )
161
+ self.w3 = skip_init(nn.Linear,
162
+ dim,
163
+ hidden_dim,
164
+ bias=False,
165
+ )
166
+
167
+ def forward(self, x):
168
+ return self.w2(F.silu(self.w1(x)) * self.w3(x))
169
+
170
+
171
+ class TransformerBlock(nn.Module):
172
+ def __init__(self, layer_id: int, args: ModelArgs):
173
+ super().__init__()
174
+ self.n_heads = args.n_heads
175
+ self.dim = args.dim
176
+ self.head_dim = args.dim // args.n_heads
177
+ self.attention = Attention(args)
178
+ self.feed_forward = FeedForward(
179
+ dim=args.dim, hidden_dim=4 * args.dim, multiple_of=args.multiple_of
180
+ )
181
+ self.layer_id = layer_id
182
+ self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
183
+ self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
184
+
185
+ def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]):
186
+ h = x + self.attention.forward(self.attention_norm(x), start_pos, freqs_cis, mask)
187
+ out = h + self.feed_forward.forward(self.ffn_norm(h))
188
+ return out
189
+
190
+ # https://github.com/gmorenz/llama/commit/4daf7f1a2f2bb22208b5d464bc2a18511d54408d
191
+ def move_parameters_to_gpu(module):
192
+ if not hasattr(module, "saved"):
193
+ module.saved = module._parameters.copy()
194
+ for k, param in module.saved.items():
195
+ if param is not None:
196
+ module._parameters[k] = param.to("cuda", non_blocking=True)
197
+ for child in module.children():
198
+ move_parameters_to_gpu(child)
199
+
200
+ def move_parameters_to_cpu(module):
201
+ for k, param in module.saved.items():
202
+ del module._parameters[k]
203
+ module._parameters[k] = param
204
+ for child in module.children():
205
+ move_parameters_to_cpu(child)
206
+
207
+
208
+ class Transformer(nn.Module):
209
+ def __init__(self, params: ModelArgs):
210
+ super().__init__()
211
+ self.params = params
212
+ self.vocab_size = params.vocab_size
213
+ self.n_layers = params.n_layers
214
+
215
+ self.tok_embeddings = skip_init(nn.Embedding,
216
+ params.vocab_size,
217
+ params.dim,
218
+ )
219
+
220
+ self.layers = torch.nn.ModuleList()
221
+ for layer_id in range(params.n_layers):
222
+ self.layers.append(TransformerBlock(layer_id, params))
223
+
224
+ self.layer_locations = [None] * len(self.layers)
225
+
226
+ self.norm = RMSNorm(params.dim, eps=params.norm_eps).cuda()
227
+ self.output = skip_init(nn.Linear,
228
+ params.dim,
229
+ params.vocab_size,
230
+ bias=False,
231
+ ).cuda()
232
+
233
+ self.freqs_cis = precompute_freqs_cis(
234
+ self.params.dim // self.params.n_heads, self.params.max_seq_len * 2
235
+ ).cuda()
236
+
237
+ @torch.inference_mode()
238
+ def forward(self, tokens: torch.Tensor, start_pos: int):
239
+ use_gpu = True # start_pos == 0
240
+
241
+ _bsz, seqlen = tokens.shape
242
+ h = self.tok_embeddings(tokens)
243
+ self.freqs_cis = self.freqs_cis
244
+ freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]
245
+ if use_gpu:
246
+ h = h.cuda()
247
+
248
+ mask = None
249
+ if seqlen > 1:
250
+ mask = torch.full(
251
+ (1, 1, seqlen, seqlen), float("-inf"), device=tokens.device
252
+ )
253
+ mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h)
254
+
255
+ if use_gpu and mask is not None:
256
+ mask = mask.cuda()
257
+
258
+ for layer in tqdm(self.layers, desc="flayers", leave=True):
259
+ if use_gpu:
260
+ move_parameters_to_gpu(layer)
261
+ h = layer(h, start_pos, freqs_cis, mask)
262
+ if use_gpu:
263
+ move_parameters_to_cpu(layer)
264
+
265
+ h = self.norm(h)
266
+ if use_gpu:
267
+ del mask
268
+ torch.cuda.empty_cache()
269
+ output = self.output(h[:, -1, :]) # only compute last logits
270
+ return output.float()
llama/tokenizer.py ADDED
@@ -0,0 +1,40 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed according to the terms of the GNU General Public License version 3.
3
+
4
+ from sentencepiece import SentencePieceProcessor
5
+ from logging import getLogger
6
+ from typing import List
7
+ import os
8
+
9
+
10
+ logger = getLogger()
11
+
12
+
13
+ class Tokenizer:
14
+ def __init__(self, model_path: str):
15
+ # reload tokenizer
16
+ assert os.path.isfile(model_path), model_path
17
+ self.sp_model = SentencePieceProcessor(model_file=model_path)
18
+ logger.info(f"Reloaded SentencePiece model from {model_path}")
19
+
20
+ # BOS / EOS token IDs
21
+ self.n_words: int = self.sp_model.vocab_size()
22
+ self.bos_id: int = self.sp_model.bos_id()
23
+ self.eos_id: int = self.sp_model.eos_id()
24
+ self.pad_id: int = self.sp_model.pad_id()
25
+ logger.info(
26
+ f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
27
+ )
28
+ assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
29
+
30
+ def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
31
+ assert type(s) is str
32
+ t = self.sp_model.encode(s)
33
+ if bos:
34
+ t = [self.bos_id] + t
35
+ if eos:
36
+ t = t + [self.eos_id]
37
+ return t
38
+
39
+ def decode(self, t: List[int]) -> str:
40
+ return self.sp_model.decode(t)
merge-weights.py ADDED
@@ -0,0 +1,168 @@
1
+ # Original copyright by Jason Phang
2
+ # https://github.com/zphang
3
+ # Taken here
4
+ # https://github.com/huggingface/transformers/pull/21955/commits/8978f28e6c44b083c0b190d3931902c2904c940a#diff-110a445233a8b15a0875998eeaf75cb8607b38a5daa736291dd058766879bbdd
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import shutil
10
+ import torch
11
+
12
+ """
13
+ Sample usage:
14
+ ```
15
+ python merge-weights.py --input_dir D:\Downloads\LLaMA --model_size 13B
16
+ ```
17
+ """
18
+
19
+ INTERMEDIATE_SIZE_MAP = {
20
+ "7B": 11008,
21
+ "13B": 13824,
22
+ "30B": 17920,
23
+ "65B": 22016,
24
+ }
25
+
26
+ NUM_SHARDS = {
27
+ "7B": 1,
28
+ "13B": 2,
29
+ "30B": 4,
30
+ "65B": 8,
31
+ }
32
+
33
+
34
+ def read_json(path):
35
+ with open(path, "r") as f:
36
+ return json.loads(f.read())
37
+
38
+
39
+ def write_model(input_base_path, model_size):
40
+ assert model_size in INTERMEDIATE_SIZE_MAP
41
+
42
+ params = read_json(os.path.join(input_base_path, "params.json"))
43
+ num_shards = NUM_SHARDS[model_size]
44
+ n_layers = params["n_layers"]
45
+ n_heads = params["n_heads"]
46
+ n_heads_per_shard = n_heads // num_shards
47
+ dim = params["dim"]
48
+ dims_per_head = dim // n_heads
49
+
50
+ # Load weights
51
+ if model_size == "7B":
52
+ loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cuda")
53
+ else:
54
+ loaded = [
55
+ torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cuda")
56
+ for i in range(num_shards)
57
+ ]
58
+
59
+ state_dict = {}
60
+
61
+ for layer_i in range(n_layers):
62
+ if model_size == "7B":
63
+ state_dict |= {
64
+ f"layers.{layer_i}.attention.wq.weight": loaded[
65
+ f"layers.{layer_i}.attention.wq.weight"
66
+ ],
67
+ f"layers.{layer_i}.attention.wk.weight": loaded[
68
+ f"layers.{layer_i}.attention.wk.weight"
69
+ ],
70
+ f"layers.{layer_i}.attention.wv.weight": loaded[
71
+ f"layers.{layer_i}.attention.wv.weight"
72
+ ],
73
+ f"layers.{layer_i}.attention.wo.weight": loaded[
74
+ f"layers.{layer_i}.attention.wo.weight"
75
+ ],
76
+ f"layers.{layer_i}.feed_forward.w1.weight": loaded[
77
+ f"layers.{layer_i}.feed_forward.w1.weight"
78
+ ],
79
+ f"layers.{layer_i}.feed_forward.w2.weight": loaded[
80
+ f"layers.{layer_i}.feed_forward.w2.weight"
81
+ ],
82
+ f"layers.{layer_i}.feed_forward.w3.weight": loaded[
83
+ f"layers.{layer_i}.feed_forward.w3.weight"
84
+ ],
85
+ f"layers.{layer_i}.attention_norm.weight": loaded[
86
+ f"layers.{layer_i}.attention_norm.weight"
87
+ ],
88
+ f"layers.{layer_i}.ffn_norm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
89
+ }
90
+ else:
91
+ state_dict |= {
92
+ f"layers.{layer_i}.attention_norm.weight": loaded[0][
93
+ f"layers.{layer_i}.attention_norm.weight"
94
+ ],
95
+ f"layers.{layer_i}.ffn_norm.weight": loaded[0][f"layers.{layer_i}.ffn_norm.weight"],
96
+ }
97
+ state_dict[f"layers.{layer_i}.attention.wq.weight"] = torch.cat(
98
+ [
99
+ loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
100
+ for i in range(num_shards)
101
+ ],
102
+ dim=0,
103
+ ).reshape(dim, dim)
104
+ state_dict[f"layers.{layer_i}.attention.wk.weight"] = torch.cat(
105
+ [
106
+ loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(n_heads_per_shard, dims_per_head, dim)
107
+ for i in range(num_shards)
108
+ ],
109
+ dim=0,
110
+ ).reshape(dim, dim)
111
+ state_dict[f"layers.{layer_i}.attention.wv.weight"] = torch.cat(
112
+ [
113
+ loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(n_heads_per_shard, dims_per_head, dim)
114
+ for i in range(num_shards)
115
+ ],
116
+ dim=0,
117
+ ).reshape(dim, dim)
118
+ state_dict[f"layers.{layer_i}.attention.wo.weight"] = torch.cat(
119
+ [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
120
+ )
121
+ state_dict[f"layers.{layer_i}.feed_forward.w1.weight"] = torch.cat(
122
+ [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
123
+ )
124
+ state_dict[f"layers.{layer_i}.feed_forward.w2.weight"] = torch.cat(
125
+ [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
126
+ )
127
+ state_dict[f"layers.{layer_i}.feed_forward.w3.weight"] = torch.cat(
128
+ [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
129
+ )
130
+
131
+ if model_size == "7B":
132
+ state_dict |= {
133
+ "tok_embeddings.weight": loaded["tok_embeddings.weight"],
134
+ "norm.weight": loaded["norm.weight"],
135
+ "output.weight": loaded["output.weight"],
136
+ }
137
+ else:
138
+ state_dict |= {
139
+ "norm.weight": loaded[0]["norm.weight"],
140
+ "tok_embeddings.weight": torch.cat(
141
+ [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
142
+ ),
143
+ "output.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
144
+ }
145
+
146
+ torch.save(state_dict, 'merged.pth')
147
+
148
+
149
+ def main():
150
+ parser = argparse.ArgumentParser()
151
+ parser.add_argument(
152
+ "--input_dir",
153
+ help="Location of LLaMA weights, which contains tokenizer.model and model folders",
154
+ )
155
+ parser.add_argument(
156
+ "--model_size",
157
+ choices=["7B", "13B", "30B", "65B"],
158
+ )
159
+ args = parser.parse_args()
160
+
161
+ write_model(
162
+ input_base_path=os.path.join(args.input_dir, args.model_size),
163
+ model_size=args.model_size,
164
+ )
165
+
166
+
167
+ if __name__ == "__main__":
168
+ main()
python3.10 ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ fairscale
2
+ fire
3
+ sentencepiece
4
+ tqdm
5
+ pyarrow
setup.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed according to the terms of the GNU General Public License version 3.
3
+
4
+ from setuptools import setup, find_packages
5
+
6
+ setup(name="llama", version="0.0.0", packages=find_packages())
tokenizer/.gitignore ADDED
@@ -0,0 +1 @@
1
+
tokenizer/tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723