Commit fcbe832
Parent(s): 542eb73
Upload 26 files

- README.md +81 -0
- example-chat.py +115 -0
- example-chat2.py +74 -0
- example-cpu.py +133 -0
- llama.egg-info/PKG-INFO +10 -0
- llama.egg-info/SOURCES.txt +10 -0
- llama.egg-info/dependency_links.txt +1 -0
- llama.egg-info/top_level.txt +1 -0
- llama/__init__.py +6 -0
- llama/__pycache__/__init__.cpython-310.pyc +0 -0
- llama/__pycache__/__init__.cpython-37.pyc +0 -0
- llama/__pycache__/generation.cpython-310.pyc +0 -0
- llama/__pycache__/generation.cpython-37.pyc +0 -0
- llama/__pycache__/model.cpython-310.pyc +0 -0
- llama/__pycache__/model.cpython-37.pyc +0 -0
- llama/__pycache__/tokenizer.cpython-310.pyc +0 -0
- llama/__pycache__/tokenizer.cpython-37.pyc +0 -0
- llama/generation.py +99 -0
- llama/model.py +270 -0
- llama/tokenizer.py +40 -0
- merge-weights.py +168 -0
- python3.10 +0 -0
- requirements.txt +5 -0
- setup.py +6 -0
- tokenizer/.gitignore +1 -0
- tokenizer/tokenizer.model +3 -0
README.md ADDED
@@ -0,0 +1,81 @@
# Chat with Meta's LLaMA models at home made easy

This repository is a chat example with [LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) ([arXiv](https://arxiv.org/abs/2302.13971v1)) models running on a typical home PC. You will just need an NVIDIA video card and some RAM to chat with the model.

This repo is heavily based on Meta's original repo: https://github.com/facebookresearch/llama

And on Venuatu's repo: https://github.com/venuatu/llama

### Examples of chats

https://github.com/facebookresearch/llama/issues/162

### System requirements
- A reasonably modern CPU
- An NVIDIA graphics card
- 64 GB of RAM, or better 128 GB (192 or 256 GB would be perfect)

You may get by with 32 GB of RAM, but inference will be slow (limited by the read speed of your swap file).

I am running this on a 12700K / 128 GB RAM / NVIDIA 3070 Ti 8 GB / fast large NVMe and getting one token from the 30B model every few seconds.

For example, the 30B model uses around 70 GB of RAM.

If you do not have a powerful video card, you may use another repo for CPU-only inference: https://github.com/randaller/llama-cpu

### Conda Environment Setup Example for Windows 10+
Download and install Anaconda Python (https://www.anaconda.com) and run the Anaconda Prompt:
```
conda create -n llama python=3.10
conda activate llama
conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
```

### Setup
In a conda env with PyTorch / CUDA available, run:
```
pip install -r requirements.txt
```
Then, in this repository:
```
pip install -e .
```

### Download tokenizer and models
magnet:?xt=urn:btih:ZXXDAUWYLRUXXBHUYEMS6Q5CE5WA3LVA&dn=LLaMA

or

magnet:?xt=urn:btih:b8287ebfa04f879b048d4d4404108cf3e8014352&dn=LLaMA&tr=udp%3a%2f%2ftracker.opentrackr.org%3a1337%2fannounce

### Prepare model

First, you need to unshard the model checkpoints into a single file. Let's do this for the 30B model.

```
python merge-weights.py --input_dir D:\Downloads\LLaMA --model_size 30B
```

In this example, D:\Downloads\LLaMA is the root folder of the downloaded torrent with the weights.

This will create a merged.pth file in the root folder of this repo.

Place this file, together with the model's corresponding (torrentroot)/30B/params.json, into the [/model] folder.

So you should end up with two files in the [/model] folder: merged.pth and params.json.

Place the (torrentroot)/tokenizer.model file into the [/tokenizer] folder of this repo. Now you are ready to go.

### Run the chat

```
python example-chat.py ./model ./tokenizer/tokenizer.model
```

### Enable multi-line answers

If you wish to stop generation not at the "\n" character, but at another signature, like "User:" (which is also a good idea), or any other, make the following modification in llama/generation.py:

![](https://user-images.githubusercontent.com/3244685/224318189-489bcc61-2fff-4a0e-8737-a461858ba766.png)

Here, -5 means removing the last 5 characters from the resulting context, which is the length of your stop signature, "User:" in this example.
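For reference, a minimal sketch of what this modification could look like inside `generate()` (a hypothetical fragment; the exact lines in your copy of llama/generation.py may differ):

```
# Hypothetical sketch: stop generation once the decoded context ends with a
# custom stop signature instead of relying on a bare newline.
stop_signature = "User:"
d = self.tokenizer.decode(t)
if d.endswith(stop_signature):
    decoded[i] = d[:-len(stop_signature)]  # -5 for "User:"
    return decoded
```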
example-chat.py ADDED
@@ -0,0 +1,115 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.

from typing import Tuple
import os
import sys
import torch
import fire
import time
import json
import pyarrow as pa

from pathlib import Path

from llama import ModelArgs, Transformer, Tokenizer, LLaMA


def load(
    ckpt_dir: str,
    tokenizer_path: str,
    max_seq_len: int,
    max_batch_size: int,
) -> LLaMA:
    start_time = time.time()
    arrow_dir = Path(ckpt_dir).expanduser() / 'arrow'

    if not arrow_dir.exists():
        print('Converting checkpoints to arrow format')
        checkpoints = sorted(Path(ckpt_dir).expanduser().glob("*.pth"))
        for ckpt_file in checkpoints:
            print(ckpt_file)
            index = ckpt_file.parts[-1].split('.')[-2]

            # load on CPU so the tensors can be converted to numpy below
            ckpt = torch.load(ckpt_file, map_location='cpu')
            (arrow_dir / index).mkdir(parents=True, exist_ok=True)
            for k, v in ckpt.items():
                tens = pa.Tensor.from_numpy(v.numpy())
                with pa.output_stream(arrow_dir / index / k) as f:
                    pa.ipc.write_tensor(tens, f)
            ckpt = None

    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())

    print("Loading checkpoint")
    segments = sorted((arrow_dir / '00').glob("*"))

    checkpoint = {}
    files = []
    for seg in segments:
        f = pa.memory_map(str(seg))
        files.append(f)
        t = pa.ipc.read_tensor(f).to_numpy()
        t = torch.from_numpy(t)
        checkpoint[seg.parts[-1]] = t

    # torch.set_default_tensor_type(torch.cuda.HalfTensor)
    torch.set_default_tensor_type(torch.BFloat16Tensor)
    # torch.set_default_tensor_type(torch.FloatTensor)

    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
    )
    print("Loading tokenizer")
    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words
    print("Loading model")
    model = Transformer(model_args)

    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
    model.load_state_dict(torch.load(checkpoints[-1]), strict=False)

    for f in files:
        f.close()
    files = None

    generator = LLaMA(model, tokenizer)
    print(f"Loaded in {time.time() - start_time:.2f} seconds")
    return generator


def main(
    ckpt_dir: str,
    tokenizer_path: str,
    temperature: float = 0.8,
    top_p: float = 0.95,
    max_seq_len: int = 2048,
    max_batch_size: int = 1,  # 16 for 13B, 4 for 30B and 65B, 2 for 1024 seq_len for 30B
):
    generator = load(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size)

    ctx = """A dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, and knows its own limits.
User: Hello, AI.
AI: Hello! How can I assist you today?
"""

    while True:
        prompt = input(f'User: ')
        if ctx != "":
            ctx = ctx + "User: " + prompt + "\n"
        else:
            ctx = prompt + "\n"

        # trim the rolling context: keep only the last 1920 characters once it exceeds 2048
        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx

        if len(ctx.strip()) > 0:
            prompts = [ctx]
            results = generator.generate(
                prompts, max_gen_len=2048, temperature=temperature, top_p=top_p
            )
            ctx = results[0]


if __name__ == "__main__":
    fire.Fire(main)
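A note on the design above: converting the .pth shards into one Arrow file per tensor lets the script memory-map the weights from disk instead of holding a second full copy in RAM. A minimal round-trip sketch of the same pyarrow pattern, using a toy array in place of a real weight tensor:

```
import numpy as np
import pyarrow as pa

# write a tensor to an Arrow IPC file, then memory-map it back
arr = np.arange(12, dtype=np.float32).reshape(3, 4)
with pa.output_stream("weight.arrow") as f:
    pa.ipc.write_tensor(pa.Tensor.from_numpy(arr), f)

mm = pa.memory_map("weight.arrow")
restored = pa.ipc.read_tensor(mm).to_numpy()
assert (restored == arr).all()
```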
example-chat2.py ADDED
@@ -0,0 +1,74 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.

from typing import Tuple
import os
import sys
import torch
import fire
import time
import json
from pathlib import Path
from llama import ModelArgs, Transformer, Tokenizer, LLaMA


def load(
    ckpt_dir: str,
    tokenizer_path: str,
    max_seq_len: int,
    max_batch_size: int,
) -> LLaMA:
    print("Creating model...")
    start_time = time.time()
    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))

    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())

    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
    )

    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words

    model = Transformer(model_args)
    model.to("cpu")

    print("Loading merged checkpoint...")
    checkpoint = torch.load(checkpoints[-1], map_location="cuda")
    model.load_state_dict(checkpoint, strict=False)
    del checkpoint

    generator = LLaMA(model, tokenizer)
    print(f"Loaded model in {time.time() - start_time:.2f} seconds")
    return generator


def main(
    ckpt_dir: str = './model',
    tokenizer_path: str = './tokenizer/tokenizer.model',
    temperature: float = 0.8,
    top_p: float = 0.95,
    max_seq_len: int = 256,  # up to 2048
    max_batch_size: int = 5,
):
    # torch.manual_seed(1)
    torch.set_default_dtype(torch.bfloat16)

    generator = load(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size)

    while True:
        prompt = input(f'prompt> ')
        if len(prompt.strip()) > 0:
            prompts = [prompt]
            results = generator.generate(
                prompts, max_gen_len=256, temperature=temperature, top_p=top_p
            )

            for result in results:
                print(result)


if __name__ == "__main__":
    fire.Fire(main)
example-cpu.py ADDED
@@ -0,0 +1,133 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.

from typing import Tuple
import os
import sys
import torch
import fire
import time
import json
from pathlib import Path
from llama import ModelArgs, Transformer, Tokenizer, LLaMA


def load(
    ckpt_dir: str,
    tokenizer_path: str,
    max_seq_len: int,
    max_batch_size: int,
) -> LLaMA:
    print("Creating model...")
    start_time = time.time()
    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))

    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())

    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
    )

    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words

    model = Transformer(model_args)

    # Original copyright by tloen
    # https://github.com/tloen/llama-int8/blob/main/example.py
    # Dimension along which each sharded parameter is concatenated across
    # checkpoint files (0 = rows, -1 = columns, None = replicated in every shard)
    key_to_dim = {
        "w1": 0,
        "w2": -1,
        "w3": 0,
        "wo": -1,
        "wq": 0,
        "wk": 0,
        "wv": 0,
        "output": 0,
        "tok_embeddings": -1,
        "ffn_norm": None,
        "attention_norm": None,
        "norm": None,
        "rope": None,
    }

    for i, ckpt in enumerate(checkpoints):
        print(f"Loading checkpoint {i}")
        checkpoint = torch.load(ckpt, map_location="cpu")
        for parameter_name, parameter in model.named_parameters():
            short_name = parameter_name.split(".")[-2]
            if key_to_dim[short_name] is None and i == 0:
                parameter.data = checkpoint[parameter_name]
            elif key_to_dim[short_name] == 0:
                size = checkpoint[parameter_name].size(0)
                parameter.data[size * i: size * (i + 1), :] = checkpoint[
                    parameter_name
                ]
            elif key_to_dim[short_name] == -1:
                size = checkpoint[parameter_name].size(-1)
                parameter.data[:, size * i: size * (i + 1)] = checkpoint[
                    parameter_name
                ]
            del checkpoint[parameter_name]
        del checkpoint

    model.to("cpu")

    generator = LLaMA(model, tokenizer)
    print(f"Loaded model in {time.time() - start_time:.2f} seconds")
    return generator


def main(
    ckpt_dir: str = './model',
    tokenizer_path: str = './tokenizer/tokenizer.model',
    temperature: float = 0.8,
    top_p: float = 0.95,
    max_seq_len: int = 512,  # up to 2048
    max_batch_size: int = 32,
):
    # torch.manual_seed(1)
    # torch.set_default_dtype(torch.bfloat16)

    generator = load(ckpt_dir, tokenizer_path, max_seq_len, max_batch_size)

    prompts = [
        ##### For these prompts, the expected answer is the natural continuation of the prompt #####

        "I believe the meaning of life is",
        # "Simply put, the theory of relativity states that ",
        # "Building a website can be done in 10 simple steps:\n",

        ##### Few shot prompts: https://huggingface.co/blog/few-shot-learning-gpt-neo-and-inference-api #####

        # """Tweet: "I hate it when my phone battery dies."
        # Sentiment: Negative
        # ###
        # Tweet: "My day has been 👍"
        # Sentiment: Positive
        # ###
        # Tweet: "This is the link to the article"
        # Sentiment: Neutral
        # ###
        # Tweet: "This new music video was incredibile"
        # Sentiment:""",

        # """Translate English to French:
        # sea otter => loutre de mer
        # peppermint => menthe poivrée
        # plush girafe => girafe peluche
        # cheese =>""",
    ]

    results = generator.generate(
        prompts, max_gen_len=256, temperature=temperature, top_p=top_p
    )

    for result in results:
        print(result)
        print("\n==================================\n")


if __name__ == "__main__":
    fire.Fire(main)
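Since `main()` supplies defaults for every argument and is wrapped in `fire.Fire`, the script should be launchable directly, with the keyword arguments exposed as optional CLI flags (paths below assume the layout from the README):

```
python example-cpu.py
python example-cpu.py --temperature 0.7 --max_seq_len 1024
```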
llama.egg-info/PKG-INFO ADDED
@@ -0,0 +1,10 @@
Metadata-Version: 1.0
Name: llama
Version: 0.0.0
Summary: UNKNOWN
Home-page: UNKNOWN
Author: UNKNOWN
Author-email: UNKNOWN
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
llama.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,10 @@
README.md
setup.py
llama/__init__.py
llama/generation.py
llama/model.py
llama/tokenizer.py
llama.egg-info/PKG-INFO
llama.egg-info/SOURCES.txt
llama.egg-info/dependency_links.txt
llama.egg-info/top_level.txt
llama.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@

llama.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
llama
llama/__init__.py ADDED
@@ -0,0 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.

from .generation import LLaMA
from .model import ModelArgs, Transformer
from .tokenizer import Tokenizer
llama/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (299 Bytes).
llama/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (293 Bytes).
llama/__pycache__/generation.cpython-310.pyc ADDED
Binary file (3.02 kB).
llama/__pycache__/generation.cpython-37.pyc ADDED
Binary file (3.02 kB).
llama/__pycache__/model.cpython-310.pyc ADDED
Binary file (8.24 kB).
llama/__pycache__/model.cpython-37.pyc ADDED
Binary file (8.13 kB).
llama/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (1.44 kB).
llama/__pycache__/tokenizer.cpython-37.pyc ADDED
Binary file (1.44 kB).
llama/generation.py ADDED
@@ -0,0 +1,99 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.

from typing import List

import torch
import traceback

from llama.tokenizer import Tokenizer
from llama.model import Transformer
from tqdm import trange


class LLaMA:
    def __init__(self, model: Transformer, tokenizer: Tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def generate(
        self,
        prompts: List[str],
        max_gen_len: int,
        temperature: float = 0.8,
        top_p: float = 0.95,
    ) -> List[str]:
        bsz = len(prompts)
        params = self.model.params
        assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)

        count_newlines = prompts[0].count("\n")

        prompt_tokens = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts]

        min_prompt_size = min([len(t) for t in prompt_tokens])
        max_prompt_size = max([len(t) for t in prompt_tokens])

        total_len = min(params.max_seq_len, max_gen_len + max_prompt_size)

        tokens = torch.full((bsz, total_len), self.tokenizer.pad_id).long()
        for k, t in enumerate(prompt_tokens):
            tokens[k, : len(t)] = torch.tensor(t).long()
            tokens[k, -1] = self.tokenizer.eos_id
        input_text_mask = tokens != self.tokenizer.pad_id
        start_pos = min_prompt_size
        prev_pos = 0
        decoded = [None] * bsz
        for cur_pos in trange(start_pos, total_len, desc="forward"):
            logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
            if temperature > 0:
                probs = torch.softmax(logits / temperature, dim=-1)
                next_token = sample_top_p(probs, top_p)
            else:
                next_token = torch.argmax(logits, dim=-1)
            next_token = next_token.reshape(-1).cpu()
            # only replace token if prompt has already been generated
            next_token = torch.where(
                input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
            )
            tokens[:, cur_pos] = next_token
            prev_pos = cur_pos

            print("-" * 30)
            for i, t in enumerate(tokens.tolist()):
                # cut to max gen len
                # t = t[: len(prompt_tokens[i]) + max_gen_len]
                t = t[: min(cur_pos, len(prompt_tokens[i]) + max_gen_len)]
                # cut to eos tok if any
                try:
                    t = t[: t.index(self.tokenizer.eos_id)]
                except ValueError:
                    pass  # traceback.print_exc()
                try:
                    d = self.tokenizer.decode(t)
                    print([i] * 20)
                    print(d)
                    decoded[i] = d

                    # stop as soon as the decoded text has gained a newline
                    # beyond those already present in the prompt
                    result_count_newlines = d.count("\n")
                    if result_count_newlines > count_newlines:
                        return decoded

                except IndexError:
                    traceback.print_exc()
                    print(t)
            print("-" * 30)
        return decoded


def sample_top_p(probs, p):
    # nucleus (top-p) sampling: keep the smallest set of tokens whose
    # cumulative probability exceeds p, renormalize, and sample from it
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort[mask] = 0.0
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = torch.multinomial(probs_sort, num_samples=1)
    next_token = torch.gather(probs_idx, -1, next_token)
    return next_token
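As a quick illustration of the nucleus cutoff in `sample_top_p`: with the toy distribution below and p=0.7, the sorted cumulative mass passes 0.7 after the two largest entries, so only indices 0 and 1 can ever be drawn:

```
import torch

probs = torch.tensor([[0.5, 0.3, 0.15, 0.05]])
samples = [sample_top_p(probs, p=0.7).item() for _ in range(10)]
print(samples)  # every sample is 0 or 1
```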
llama/model.py ADDED
@@ -0,0 +1,270 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.

from typing import Optional, Tuple
from dataclasses import dataclass
import math

import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils import skip_init

from tqdm import tqdm


@dataclass
class ModelArgs:
    dim: int = 512
    n_layers: int = 8
    n_heads: int = 8
    vocab_size: int = -1  # defined later by tokenizer
    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
    norm_eps: float = 1e-5

    max_batch_size: int = 32
    max_seq_len: int = 1024


class RMSNorm(torch.nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight


def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    t = torch.arange(end, device=freqs.device)  # type: ignore
    freqs = torch.outer(t, freqs).float()  # type: ignore
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
    return freqs_cis


def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    ndim = x.ndim
    assert 0 <= 1 < ndim
    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
    return freqs_cis.view(*shape)


def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
    return xq_out.type_as(xq), xk_out.type_as(xk)


class Attention(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()

        self.n_local_heads = args.n_heads  # // fs_init.get_model_parallel_world_size()
        self.head_dim = args.dim // args.n_heads

        self.wq = skip_init(nn.Linear,
            args.dim,
            args.n_heads * self.head_dim,
            bias=False,
        )
        self.wk = skip_init(nn.Linear,
            args.dim,
            args.n_heads * self.head_dim,
            bias=False,
        )
        self.wv = skip_init(nn.Linear,
            args.dim,
            args.n_heads * self.head_dim,
            bias=False,
        )
        self.wo = skip_init(nn.Linear,
            args.n_heads * self.head_dim,
            args.dim,
            bias=False,
        )

        self.cache_k = torch.zeros(
            (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim)
        ).cuda()
        self.cache_v = torch.zeros(
            (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim)
        ).cuda()

    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]):
        bsz, seqlen, _ = x.shape
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim)

        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

        self.cache_k = self.cache_k.to(xq)
        self.cache_v = self.cache_v.to(xq)

        self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk
        self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv

        keys = self.cache_k[:bsz, : start_pos + seqlen]
        values = self.cache_v[:bsz, : start_pos + seqlen]

        xq = xq.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)
        scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores + mask  # (bs, n_local_heads, slen, cache_len + slen)
        scores = F.softmax(scores.float(), dim=-1).type_as(xq)
        output = torch.matmul(scores, values)  # (bs, n_local_heads, slen, head_dim)
        output = output.transpose(
            1, 2
        ).contiguous().view(bsz, seqlen, -1)

        return self.wo(output)


class FeedForward(nn.Module):
    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        multiple_of: int,
    ):
        super().__init__()
        hidden_dim = int(2 * hidden_dim / 3)
        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

        self.w1 = skip_init(nn.Linear,
            dim,
            hidden_dim,
            bias=False,
        )
        self.w2 = skip_init(nn.Linear,
            hidden_dim,
            dim,
            bias=False,
        )
        self.w3 = skip_init(nn.Linear,
            dim,
            hidden_dim,
            bias=False,
        )

    def forward(self, x):
        return self.w2(F.silu(self.w1(x)) * self.w3(x))


class TransformerBlock(nn.Module):
    def __init__(self, layer_id: int, args: ModelArgs):
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim, hidden_dim=4 * args.dim, multiple_of=args.multiple_of
        )
        self.layer_id = layer_id
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]):
        h = x + self.attention.forward(self.attention_norm(x), start_pos, freqs_cis, mask)
        out = h + self.feed_forward.forward(self.ffn_norm(h))
        return out


# https://github.com/gmorenz/llama/commit/4daf7f1a2f2bb22208b5d464bc2a18511d54408d
def move_parameters_to_gpu(module):
    if not hasattr(module, "saved"):
        module.saved = module._parameters.copy()
    for k, param in module.saved.items():
        if param is not None:
            module._parameters[k] = param.to("cuda", non_blocking=True)
    for child in module.children():
        move_parameters_to_gpu(child)


def move_parameters_to_cpu(module):
    for k, param in module.saved.items():
        del module._parameters[k]
        module._parameters[k] = param
    for child in module.children():
        move_parameters_to_cpu(child)


class Transformer(nn.Module):
    def __init__(self, params: ModelArgs):
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = skip_init(nn.Embedding,
            params.vocab_size,
            params.dim,
        )

        self.layers = torch.nn.ModuleList()
        for layer_id in range(params.n_layers):
            self.layers.append(TransformerBlock(layer_id, params))

        self.layer_locations = [None] * len(self.layers)

        self.norm = RMSNorm(params.dim, eps=params.norm_eps).cuda()
        self.output = skip_init(nn.Linear,
            params.dim,
            params.vocab_size,
            bias=False,
        ).cuda()

        self.freqs_cis = precompute_freqs_cis(
            self.params.dim // self.params.n_heads, self.params.max_seq_len * 2
        ).cuda()

    @torch.inference_mode()
    def forward(self, tokens: torch.Tensor, start_pos: int):
        use_gpu = True  # start_pos == 0

        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]
        if use_gpu:
            h = h.cuda()

        mask = None
        if seqlen > 1:
            mask = torch.full(
                (1, 1, seqlen, seqlen), float("-inf"), device=tokens.device
            )
            mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h)

        if use_gpu and mask is not None:
            mask = mask.cuda()

        for layer in tqdm(self.layers, desc="flayers", leave=True):
            if use_gpu:
                move_parameters_to_gpu(layer)
            h = layer(h, start_pos, freqs_cis, mask)
            if use_gpu:
                move_parameters_to_cpu(layer)

        h = self.norm(h)
        if use_gpu:
            del mask
            torch.cuda.empty_cache()
        output = self.output(h[:, -1, :])  # only compute last logits
        return output.float()
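The `move_parameters_to_gpu` / `move_parameters_to_cpu` pair above (credited to gmorenz) is what lets the whole model sit in system RAM while only one transformer block at a time occupies VRAM. A minimal standalone sketch of the same streaming pattern, with plain Linear layers standing in for transformer blocks:

```
import torch
from torch import nn

layers = nn.ModuleList([nn.Linear(512, 512) for _ in range(4)])  # stand-in blocks
x = torch.randn(1, 512, device="cuda")
for layer in layers:
    layer.to("cuda")   # stream this block's weights into VRAM
    x = layer(x)
    layer.to("cpu")    # evict them before the next block
print(x.shape)
```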
llama/tokenizer.py ADDED
@@ -0,0 +1,40 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.

from sentencepiece import SentencePieceProcessor
from logging import getLogger
from typing import List
import os


logger = getLogger()


class Tokenizer:
    def __init__(self, model_path: str):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        logger.info(f"Reloaded SentencePiece model from {model_path}")

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)
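A quick round-trip sanity check of the wrapper (assuming the tokenizer.model from the torrent is already in the [/tokenizer] folder):

```
from llama import Tokenizer

tok = Tokenizer(model_path="./tokenizer/tokenizer.model")
ids = tok.encode("Hello, world!", bos=True, eos=False)
print(ids)              # token IDs, starting with the BOS id
print(tok.decode(ids))  # -> Hello, world!
```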
merge-weights.py ADDED
@@ -0,0 +1,168 @@
# Original copyright by Jason Phang
# https://github.com/zphang
# Taken from here:
# https://github.com/huggingface/transformers/pull/21955/commits/8978f28e6c44b083c0b190d3931902c2904c940a#diff-110a445233a8b15a0875998eeaf75cb8607b38a5daa736291dd058766879bbdd

import argparse
import json
import os
import shutil
import torch

"""
Sample usage:
```
python merge-weights.py --input_dir D:\Downloads\LLaMA --model_size 13B
```
"""

INTERMEDIATE_SIZE_MAP = {
    "7B": 11008,
    "13B": 13824,
    "30B": 17920,
    "65B": 22016,
}

NUM_SHARDS = {
    "7B": 1,
    "13B": 2,
    "30B": 4,
    "65B": 8,
}


def read_json(path):
    with open(path, "r") as f:
        return json.loads(f.read())


def write_model(input_base_path, model_size):
    assert model_size in INTERMEDIATE_SIZE_MAP

    params = read_json(os.path.join(input_base_path, "params.json"))
    num_shards = NUM_SHARDS[model_size]
    n_layers = params["n_layers"]
    n_heads = params["n_heads"]
    n_heads_per_shard = n_heads // num_shards
    dim = params["dim"]
    dims_per_head = dim // n_heads

    # Load weights (change map_location to "cpu" if the shards do not fit in VRAM)
    if model_size == "7B":
        loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cuda")
    else:
        loaded = [
            torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cuda")
            for i in range(num_shards)
        ]

    state_dict = {}

    for layer_i in range(n_layers):
        if model_size == "7B":
            state_dict |= {
                f"layers.{layer_i}.attention.wq.weight": loaded[
                    f"layers.{layer_i}.attention.wq.weight"
                ],
                f"layers.{layer_i}.attention.wk.weight": loaded[
                    f"layers.{layer_i}.attention.wk.weight"
                ],
                f"layers.{layer_i}.attention.wv.weight": loaded[
                    f"layers.{layer_i}.attention.wv.weight"
                ],
                f"layers.{layer_i}.attention.wo.weight": loaded[
                    f"layers.{layer_i}.attention.wo.weight"
                ],
                f"layers.{layer_i}.feed_forward.w1.weight": loaded[
                    f"layers.{layer_i}.feed_forward.w1.weight"
                ],
                f"layers.{layer_i}.feed_forward.w2.weight": loaded[
                    f"layers.{layer_i}.feed_forward.w2.weight"
                ],
                f"layers.{layer_i}.feed_forward.w3.weight": loaded[
                    f"layers.{layer_i}.feed_forward.w3.weight"
                ],
                f"layers.{layer_i}.attention_norm.weight": loaded[
                    f"layers.{layer_i}.attention_norm.weight"
                ],
                f"layers.{layer_i}.ffn_norm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
            }
        else:
            state_dict |= {
                f"layers.{layer_i}.attention_norm.weight": loaded[0][
                    f"layers.{layer_i}.attention_norm.weight"
                ],
                f"layers.{layer_i}.ffn_norm.weight": loaded[0][f"layers.{layer_i}.ffn_norm.weight"],
            }
            state_dict[f"layers.{layer_i}.attention.wq.weight"] = torch.cat(
                [
                    loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
                    for i in range(num_shards)
                ],
                dim=0,
            ).reshape(dim, dim)
            state_dict[f"layers.{layer_i}.attention.wk.weight"] = torch.cat(
                [
                    loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(n_heads_per_shard, dims_per_head, dim)
                    for i in range(num_shards)
                ],
                dim=0,
            ).reshape(dim, dim)
            state_dict[f"layers.{layer_i}.attention.wv.weight"] = torch.cat(
                [
                    loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(n_heads_per_shard, dims_per_head, dim)
                    for i in range(num_shards)
                ],
                dim=0,
            ).reshape(dim, dim)
            state_dict[f"layers.{layer_i}.attention.wo.weight"] = torch.cat(
                [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
            )
            state_dict[f"layers.{layer_i}.feed_forward.w1.weight"] = torch.cat(
                [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
            )
            state_dict[f"layers.{layer_i}.feed_forward.w2.weight"] = torch.cat(
                [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
            )
            state_dict[f"layers.{layer_i}.feed_forward.w3.weight"] = torch.cat(
                [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
            )

    if model_size == "7B":
        state_dict |= {
            "tok_embeddings.weight": loaded["tok_embeddings.weight"],
            "norm.weight": loaded["norm.weight"],
            "output.weight": loaded["output.weight"],
        }
    else:
        state_dict |= {
            "norm.weight": loaded[0]["norm.weight"],
            "tok_embeddings.weight": torch.cat(
                [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
            ),
            "output.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
        }

    torch.save(state_dict, 'merged.pth')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_dir",
        help="Location of LLaMA weights, which contains tokenizer.model and model folders",
    )
    parser.add_argument(
        "--model_size",
        choices=["7B", "13B", "30B", "65B"],
    )
    args = parser.parse_args()

    write_model(
        input_base_path=os.path.join(args.input_dir, args.model_size),
        model_size=args.model_size,
    )


if __name__ == "__main__":
    main()
python3.10 ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,5 @@
fairscale
fire
sentencepiece
tqdm
pyarrow
setup.py ADDED
@@ -0,0 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.

from setuptools import setup, find_packages

setup(name="llama", version="0.0.0", packages=find_packages())
tokenizer/.gitignore ADDED
@@ -0,0 +1 @@

tokenizer/tokenizer.model ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
size 499723