chansung committed
Commit 7932e5a
1 Parent(s): de9bfc1

Create gen.py

Files changed (1):
  1. gen.py +89 -0
gen.py ADDED
from typing import Tuple

import os
import time
import json
from pathlib import Path

import torch
from fairscale.nn.model_parallel.initialize import initialize_model_parallel
from app.llama import ModelArgs, Transformer, Tokenizer, LLaMA

from google.cloud import storage

bucket_name = os.environ.get("GCS_BUCKET")

llama_weight_path = "weights/llama"
tokenizer_weight_path = "weights/tokenizer"


def setup_model_parallel() -> Tuple[int, int]:
    # torchrun provides these variables for each process
    local_rank = int(os.environ.get("LOCAL_RANK", -1))
    world_size = int(os.environ.get("WORLD_SIZE", -1))

    torch.distributed.init_process_group("nccl")
    initialize_model_parallel(world_size)
    torch.cuda.set_device(local_rank)

    # seed must be the same in all processes
    torch.manual_seed(1)
    return local_rank, world_size


def download_pretrained_models(
    ckpt_path: str,
    tokenizer_path: str
):
    # exist_ok prevents a crash when the directories already exist,
    # e.g. after a restart
    os.makedirs(llama_weight_path, exist_ok=True)
    os.makedirs(tokenizer_weight_path, exist_ok=True)

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # download the checkpoint shards from the GCS bucket
    blobs = bucket.list_blobs(prefix=f"{ckpt_path}/")
    for blob in blobs:
        if blob.name.endswith("/"):  # skip folder placeholder objects
            continue
        filename = blob.name.split("/")[-1]
        blob.download_to_filename(f"{llama_weight_path}/{filename}")

    # download the tokenizer model
    blobs = bucket.list_blobs(prefix=f"{tokenizer_path}/")
    for blob in blobs:
        if blob.name.endswith("/"):
            continue
        filename = blob.name.split("/")[-1]
        blob.download_to_filename(f"{tokenizer_weight_path}/{filename}")


def get_pretrained_models(
    ckpt_path: str,
    tokenizer_path: str,
    local_rank: int,
    world_size: int) -> LLaMA:

    download_pretrained_models(ckpt_path, tokenizer_path)

    start_time = time.time()
    checkpoints = sorted(Path(llama_weight_path).glob("*.pth"))

    # each model-parallel rank loads its own checkpoint shard
    llama_ckpt_path = checkpoints[local_rank]
    print("Loading")
    checkpoint = torch.load(llama_ckpt_path, map_location=lambda stor, loc: stor.cuda(0))
    with open(Path(llama_weight_path) / "params.json", "r") as f:
        params = json.loads(f.read())

    model_args: ModelArgs = ModelArgs(max_seq_len=1024, max_batch_size=1, **params)
    tokenizer = Tokenizer(model_path=f"{tokenizer_weight_path}/tokenizer.model")
    model_args.vocab_size = tokenizer.n_words
    # build the model with fp16 parameters on the GPU, then restore
    # the default tensor type for everything else
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    model = Transformer(model_args)
    torch.set_default_tensor_type(torch.FloatTensor)
    model.load_state_dict(checkpoint, strict=False)

    generator = LLaMA(model, tokenizer)
    print(f"Loaded in {time.time() - start_time:.2f} seconds")
    return generator


def get_output(
    generator: LLaMA,
    prompt: str,
    temperature: float = 0.8,
    top_p: float = 0.95):

    prompts = [prompt]
    results = generator.generate(prompts, max_gen_len=256, temperature=temperature, top_p=top_p)

    return results
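
A minimal usage sketch of how these helpers fit together. Everything below is an assumption for illustration, not part of this commit: the GCS prefixes ("llama_7b", "tokenizer"), the driver filename, and the prompt are placeholders, and the script is assumed to be launched with torchrun, which sets the LOCAL_RANK and WORLD_SIZE variables that setup_model_parallel() reads.

# hypothetical driver (not part of this commit), e.g. run as:
#   GCS_BUCKET=<your-bucket> torchrun --nproc_per_node 1 driver.py
from gen import setup_model_parallel, get_pretrained_models, get_output

local_rank, world_size = setup_model_parallel()

# "llama_7b" and "tokenizer" are placeholder bucket prefixes: the first
# holds the *.pth shards plus params.json, the second tokenizer.model
generator = get_pretrained_models("llama_7b", "tokenizer", local_rank, world_size)

results = get_output(generator, "The three primary colors are", temperature=0.8, top_p=0.95)
print(results[0])  # generate() returns one completion per prompt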