CoderCowMoo committed on
Commit 3974b7a • 1 Parent(s): 267b4cd

Change to use GPTQ so i dont get limited

Files changed (1): app.py (+13, -5)
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from transformers import BitsAndBytesConfig
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 import spaces
 import torch
 from safetensors import safe_open
@@ -11,10 +11,18 @@ from torch import Tensor
 from threading import Thread
 import einops
 
-
-tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-LLaMA-3-70B-Instruct")
-quantization_config = BitsAndBytesConfig(load_in_4_bit=True)
-model = AutoModelForCausalLM.from_pretrained("NousResearch/Meta-LLaMA-3-70B-Instruct", quantization_config, device_map="cuda" ).eval()
+model_id = "MaziyarPanahi/Meta-Llama-3-70B-Instruct-GPTQ"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+quantize_config = BaseQuantizeConfig(
+    bits=4,
+    group_size=128,
+    desc_act=False
+)
+model = AutoGPTQForCausalLM.from_quantized(
+    model_id,
+    use_safetensors=True,
+    device="cuda",
+    quantize_config=quantize_config).eval()
 
 
 @spaces.GPU
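
For context, here is a minimal sketch of how the newly loaded GPTQ model might be driven from the Space's Gradio handler, given the TextIteratorStreamer and Thread imports already present in app.py. Everything in it is an illustrative assumption, not part of this commit: the respond() name, the chat-message format, and the max_new_tokens budget.

# Hypothetical usage sketch (not part of the commit): stream a completion
# from the GPTQ model using the imports already present in app.py.
def respond(messages):
    # Tokenize the chat history with the model's chat template.
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")
    # The streamer yields decoded text pieces as generate() produces tokens;
    # skip_prompt avoids echoing the input back to the UI.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # generate() blocks, so run it on a worker thread and consume the streamer here.
    Thread(target=model.generate, kwargs=dict(
        input_ids=input_ids, streamer=streamer, max_new_tokens=512  # assumed budget
    )).start()
    partial = ""
    for piece in streamer:
        partial += piece
        yield partial  # Gradio re-renders the growing string on each yield

One note on the change itself: from_quantized() can normally read bits, group_size, and desc_act from the quantize_config.json shipped inside a GPTQ repo, so the explicit BaseQuantizeConfig here is most likely redundant but harmless.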