vmuchinov committed
Commit
79a49f5
1 Parent(s): c9573ed

Upload 2 files

Files changed (2)
  1. app.py +10 -3
  2. requirements.txt +0 -1
app.py CHANGED
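This change replaces the standalone awq loader (AutoAWQForCausalLM) with transformers' built-in AWQ integration: an AwqConfig with fused modules is now constructed and passed to AutoModelForCausalLM.from_pretrained via quantization_config.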
@@ -6,8 +6,8 @@ import gradio as gr
 import spaces
 import torch
 
-from awq import AutoAWQForCausalLM
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, AwqConfig
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
@@ -16,12 +16,19 @@ ACCESS_TOKEN = os.getenv("HF_TOKEN", "")
 
 model_id = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
 
-model = AutoAWQForCausalLM.from_pretrained(
+quantization_config = AwqConfig(
+    bits=4,
+    fuse_max_seq_len=512,  # Note: Update this as per your use-case
+    do_fuse=True,
+)
+
+model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
     device_map="auto",
     trust_remote_code=True,
     low_cpu_mem_usage=True,
+    quantization_config=quantization_config,
     token=ACCESS_TOKEN)
 tokenizer = AutoTokenizer.from_pretrained(
     model_id,
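For context, a minimal sketch of how the new loading path is typically driven. The AwqConfig and from_pretrained call mirror the diff above; the chat message and streaming loop are illustrative assumptions based on the TextIteratorStreamer import, not code from this commit, and running the 70B checkpoint needs a GPU with enough memory for the 4-bit weights.

import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, AwqConfig

model_id = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"

# Fused AWQ modules use faster kernels; prompt plus generated tokens
# must stay under fuse_max_seq_len, so size it to your use case.
quantization_config = AwqConfig(
    bits=4,
    fuse_max_seq_len=512,
    do_fuse=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Illustrative streaming generation (assumed usage, not part of the commit).
messages = [{"role": "user", "content": "Hello!"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
Thread(target=model.generate,
       kwargs=dict(input_ids=input_ids, streamer=streamer, max_new_tokens=64)).start()
for chunk in streamer:
    print(chunk, end="", flush=True)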
requirements.txt CHANGED
@@ -246,4 +246,3 @@ einops
 pytest
 gguf>=0.10.0
 autoawq
-awq
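With loading handled by transformers, the separate awq entry becomes redundant: the autoawq package already provides the awq Python module that the old "from awq import AutoAWQForCausalLM" relied on, and it is the backend transformers' AWQ integration builds on.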