arjunanand13 committed on
Commit
1a8e9fc
·
verified ·
1 Parent(s): 0501f59

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -42
app.py CHANGED
@@ -30,49 +30,51 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
30
  model_id = 'meta-llama/Meta-Llama-3-8B'
31
  device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
32
 
33
- # set quantization configuration to load large model with less GPU memory
34
- # this requires the `bitsandbytes` library
35
- # bnb_config = transformers.BitsAndBytesConfig(
36
- # load_in_4bit=True,
37
- # bnb_4bit_quant_type='nf4',
38
- # bnb_4bit_use_double_quant=True,
39
- # bnb_4bit_compute_dtype=bfloat16
40
- # )
41
 
42
- # tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct",token=HF_TOKEN)
43
- # model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto",token=HF_TOKEN) # to("cuda:0")
44
- # terminators = [
45
- # tokenizer.eos_token_id,
46
- # tokenizer.convert_tokens_to_ids("<|eot_id|>")
47
- # ]
48
-
49
-
50
- model_config = transformers.AutoConfig.from_pretrained(
51
- model_id,
52
- token=HF_TOKEN,
53
- # use_auth_token=hf_auth
54
- )
55
- model = transformers.AutoModelForCausalLM.from_pretrained(
56
- model_id,
57
- trust_remote_code=True,
58
- config=model_config,
59
- # quantization_config=bnb_config,
60
- token=HF_TOKEN,
61
- # use_auth_token=hf_auth
62
- )
63
- model.eval()
64
- tokenizer = transformers.AutoTokenizer.from_pretrained(
65
- model_id,
66
- token=HF_TOKEN,
67
- # use_auth_token=hf_auth
68
- )
69
- generate_text = transformers.pipeline(
70
- model=self.model, tokenizer=self.tokenizer,
71
- return_full_text=True,
72
- task='text-generation',
73
- temperature=0.01,
74
- max_new_tokens=512
75
- )
 
 
 
 
 
 
 
 
 
 
76
 
77
  """
78
  Setting up the stop list to define stopping criteria.
 
30
  model_id = 'meta-llama/Meta-Llama-3-8B'
31
  device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
32
 
 
 
 
 
 
 
 
 
33
 
34
+ """set quantization configuration to load large model with less GPU memory
35
+ this requires the `bitsandbytes` library"""
36
+ bnb_config = transformers.BitsAndBytesConfig(
37
+ load_in_4bit=True,
38
+ bnb_4bit_quant_type='nf4',
39
+ bnb_4bit_use_double_quant=True,
40
+ bnb_4bit_compute_dtype=bfloat16
41
+ )
42
+
43
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct",token=HF_TOKEN)
44
+ model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto",token=HF_TOKEN) # to("cuda:0")
45
+ terminators = [
46
+ tokenizer.eos_token_id,
47
+ tokenizer.convert_tokens_to_ids("<|eot_id|>")
48
+ ]
49
+
50
+ """CPU"""
51
+
52
+ # model_config = transformers.AutoConfig.from_pretrained(
53
+ # model_id,
54
+ # token=HF_TOKEN,
55
+ # # use_auth_token=hf_auth
56
+ # )
57
+ # model = transformers.AutoModelForCausalLM.from_pretrained(
58
+ # model_id,
59
+ # trust_remote_code=True,
60
+ # config=model_config,
61
+ # # quantization_config=bnb_config,
62
+ # token=HF_TOKEN,
63
+ # # use_auth_token=hf_auth
64
+ # )
65
+ # model.eval()
66
+ # tokenizer = transformers.AutoTokenizer.from_pretrained(
67
+ # model_id,
68
+ # token=HF_TOKEN,
69
+ # # use_auth_token=hf_auth
70
+ # )
71
+ # generate_text = transformers.pipeline(
72
+ # model=self.model, tokenizer=self.tokenizer,
73
+ # return_full_text=True,
74
+ # task='text-generation',
75
+ # temperature=0.01,
76
+ # max_new_tokens=512
77
+ # )
78
 
79
  """
80
  Setting up the stop list to define stopping criteria.