Ashishkr committed
Commit b1db2e4 · Parent: 7ba290d

Update README.md

Files changed (1): README.md (+45 -25)
README.md CHANGED
@@ -1,7 +1,6 @@
 ---
 tags:
 - autotrain
-- conversational
 - meta-llama
 - meta-llama/Llama-2-7b-hf
 inference: true
@@ -18,36 +17,56 @@ widget:
 
   response: ''
 library_name: peft
+pipeline_tag: text-generation
 ---
 
-```python
-!huggingface-cli login
-
-    (Hugging Face ASCII-art login banner)
-
-To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
-Token: <your-hf-access-token>
-```
-
 
 
38
  ```python
39
-
40
  from peft import PeftModel, PeftConfig
41
- from transformers import AutoModelForCausalLM
42
- from transformers import AutoTokenizer
43
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
 
 
 
 
 
46
 
47
  config = PeftConfig.from_pretrained("Ashishkr/llama2_medical_consultation")
48
- model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
49
  model = PeftModel.from_pretrained(model, "Ashishkr/llama2_medical_consultation").to(device)
50
- tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
 
 
 
 
 
 
 
 
 
51
 
52
  ```
53
 
@@ -60,7 +79,6 @@ def llama_generate(
     prompt: str,
     max_new_tokens: int = 128,
     temperature: float = 0.92):
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
     inputs = tokenizer(
         [prompt],
@@ -70,7 +88,10 @@ def llama_generate(
         device
     )
 
-    with torch.autocast("cuda", dtype=torch.bfloat16):
+    # Check if bfloat16 is supported, otherwise use float16
+    dtype_to_use = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+
+    with torch.autocast("cuda", dtype=dtype_to_use):
         response = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
@@ -87,7 +108,6 @@ def llama_generate(
 
     return decoded_output[len(prompt) :]
 
-
 prompt = """
 instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n
 
@@ -97,7 +117,7 @@ goes in my left arm/hand/fingers. I have had headaches since the aneurysm,
 but this is different. Also, my moods have been horrible for the past few weeks.\n
 
 response: """
-
+# You can use the function as before
 response = llama_generate(
     model,
     tokenizer,
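Assembled from the hunks above, the updated loading code reads as the following single runnable sketch. Two hedged adjustments: the committed tokenizer call passes `model_id`, which is undefined, so `base_model_id` is assumed to be the intent; and `.to(device)` is dropped after attaching the adapter, since `device_map='auto'` already places the 4-bit weights (recent `accelerate` releases reject `.to()` on quantized models). Newer `transformers` versions also prefer `token=` over the deprecated `use_auth_token=`.

```python
import transformers
import torch
from torch import cuda, bfloat16
from peft import PeftModel, PeftConfig

base_model_id = 'meta-llama/Llama-2-7b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# 4-bit NF4 quantization so the 7B base model fits a single-GPU memory budget
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)

hf_auth = "your-huggingface-access-token"  # placeholder, supply your own token
model_config = transformers.AutoConfig.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth,
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',  # places the quantized weights; no .to(device) needed
    use_auth_token=hf_auth,
)

# Attach the LoRA adapter from this repo on top of the quantized base model
config = PeftConfig.from_pretrained("Ashishkr/llama2_medical_consultation")
model = PeftModel.from_pretrained(model, "Ashishkr/llama2_medical_consultation")
model.eval()
print(f"Model loaded on {device}")

# `model_id` is undefined in the committed snippet; base_model_id is assumed here
tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth,
)
```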
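The final hunk truncates the call site after `tokenizer,`. A usage sketch consistent with the signature above; the patient description is elided in the diff and left as a placeholder here:

```python
prompt = """
instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n
input: <patient description elided in the diff> \n
response: """

response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=128,
    temperature=0.92,
)
print(response)
```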