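"""Benchmark script for llama2_wrapper.

Loads settings from a .env file, initializes LLAMA2_WRAPPER (llama.cpp on
CPU, or torch transformers on GPU), runs one example prompt, and reports
model initialization time and generation speed in tokens/sec.
"""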
import os
import time

from dotenv import load_dotenv

# Note: distutils was deprecated in Python 3.10 and removed in 3.12;
# strtobool is used here to parse "True"/"False" strings from the .env file.
from distutils.util import strtobool

from llama2_wrapper import LLAMA2_WRAPPER


def main():
    load_dotenv()

    # Generation settings, read from the environment with fallbacks.
    DEFAULT_SYSTEM_PROMPT = os.getenv("DEFAULT_SYSTEM_PROMPT", "")
    MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", "2048"))
    DEFAULT_MAX_NEW_TOKENS = int(os.getenv("DEFAULT_MAX_NEW_TOKENS", "1024"))
    MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4000"))

    MODEL_PATH = os.getenv("MODEL_PATH")
    assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}"

    # Backend flags; strtobool accepts values like "true"/"false", "1"/"0".
    LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True")))
    LOAD_IN_4BIT = bool(strtobool(os.getenv("LOAD_IN_4BIT", "True")))
    LLAMA_CPP = bool(strtobool(os.getenv("LLAMA_CPP", "True")))
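
    # A minimal .env sketch for this script; the values are illustrative
    # assumptions, not defaults shipped with llama2_wrapper:
    #   MODEL_PATH=./models/llama-2-7b-chat.ggmlv3.q4_0.bin
    #   LLAMA_CPP=True
    #   LOAD_IN_8BIT=False
    #   DEFAULT_MAX_NEW_TOKENS=1024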

    if LLAMA_CPP:
        print("Running on CPU with llama.cpp.")
    else:
        # torch is only needed for the transformers backend, so import lazily.
        import torch

        if torch.cuda.is_available():
            print("Running on GPU with torch transformers.")
        else:
            print("CUDA not found.")

    config = {
        "model_name": MODEL_PATH,
        "load_in_8bit": LOAD_IN_8BIT,
        "load_in_4bit": LOAD_IN_4BIT,
        "llama_cpp": LLAMA_CPP,
        "MAX_INPUT_TOKEN_LENGTH": MAX_INPUT_TOKEN_LENGTH,
    }

    # Time model and tokenizer initialization.
    tic = time.perf_counter()
    llama2_wrapper = LLAMA2_WRAPPER(config)
    llama2_wrapper.init_tokenizer()
    llama2_wrapper.init_model()
    toc = time.perf_counter()
    print(f"Initialized the model in {toc - tic:0.4f} seconds.")

    example = "Can you explain briefly to me what is the Python programming language?"
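    # The positional arguments to run() below appear to be (message,
    # chat_history, system_prompt, max_new_tokens, temperature, top_p, top_k);
    # this is inferred from the call site, so treat it as an assumption.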
    generator = llama2_wrapper.run(
        example, [], DEFAULT_SYSTEM_PROMPT, DEFAULT_MAX_NEW_TOKENS, 1, 0.95, 50
    )

    # Time generation. The generator streams partial responses; only the last
    # yield, which should hold the full text, is kept and printed. Starting
    # from an empty string avoids a NameError if nothing is yielded at all.
    tic = time.perf_counter()
    response = ""
    try:
        response = next(generator)
    except StopIteration:
        pass
    for response in generator:
        pass
    print(response)
    toc = time.perf_counter()

    output_token_length = llama2_wrapper.get_token_length(response)
    print(f"Generated the output in {toc - tic:0.4f} seconds.")
    print(f"Speed: {output_token_length / (toc - tic):0.4f} tokens/sec.")


if __name__ == "__main__":
    main()
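
# Usage sketch, assuming this file is saved as benchmark.py with a .env file
# (like the one outlined in main()) next to it; the package names below are
# assumptions based on the imports above:
#   pip install llama2-wrapper python-dotenv
#   python benchmark.py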