Update app.py
Browse files
app.py
CHANGED
@@ -9,18 +9,10 @@ from tokenization_yi import YiTokenizer
|
|
# Cap CUDA allocator split size to reduce fragmentation when loading a large
# quantized model; must be set before the first CUDA allocation.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:50'

model_id = "TheBloke/Yi-34B-200K-Llamafied-GPTQ"

# 4-bit GPTQ with the exllama v2 kernels.
# FIX: the original also passed disable_exllama=True, which contradicts
# exllama_config={"version": 2} (it disables the very kernels being requested)
# — the flag is dropped so the v2 kernels are actually used.
gptq_config = GPTQConfig(
    bits=4,
    exllama_config={"version": 2},
)

# Tokenizer files are expected alongside this script (current directory).
tokenizer = YiTokenizer.from_pretrained("./")

# NOTE(review): device_map="auto" lets accelerate shard the model across
# available devices — confirm this is intended vs. pinning to a single GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=gptq_config,
)
|
24 |
def run(message, chat_history, max_new_tokens=4056, temperature=3.5, top_p=0.9, top_k=800):
|
25 |
prompt = get_prompt(message, chat_history)
|
26 |
input_ids = tokenizer.encode(prompt, return_tensors='pt')
|
|
|
# Limit the CUDA caching allocator's split size to fight fragmentation while
# loading a large quantized checkpoint; set before any CUDA allocation happens.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:50'

model_id = "TheBloke/Yi-34B-200K-Llamafied-GPTQ"

# Quantization setup: 4-bit GPTQ served through the exllama v2 kernels.
gptq_config = GPTQConfig(bits=4, exllama_config={"version": 2})

# The tokenizer assets live next to this script.
tokenizer = YiTokenizer.from_pretrained("./")

# Load the quantized model directly onto the CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    quantization_config=gptq_config,
)
15 |
|
|
|
16 |
def run(message, chat_history, max_new_tokens=4056, temperature=3.5, top_p=0.9, top_k=800):
|
17 |
prompt = get_prompt(message, chat_history)
|
18 |
input_ids = tokenizer.encode(prompt, return_tensors='pt')
|