ksh-nyp committed
Commit 9e4e68e
1 parent: 9005c68

Update app.py

Files changed (1)
app.py +35 -1
app.py CHANGED
@@ -42,11 +42,45 @@ packing = False
 # Load the entire model on the GPU 0
 device_map = {"": 0}
 
-from transformers import pipeline
+# Load tokenizer and model with QLoRA configuration
+compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=use_4bit,
+    bnb_4bit_quant_type=bnb_4bit_quant_type,
+    bnb_4bit_compute_dtype=compute_dtype,
+    bnb_4bit_use_double_quant=use_nested_quant,
+)
+
+# Check GPU compatibility with bfloat16
+if compute_dtype == torch.float16 and use_4bit:
+    major, _ = torch.cuda.get_device_capability()
+    if major >= 8:
+        print("=" * 80)
+        print("Your GPU supports bfloat16: accelerate training with bf16=True")
+        print("=" * 80)
 
 # Initialize the pipeline with the LLaMA model
 model_name = "ksh-nyp/llama-2-7b-chat-TCMKB2"
 pipe = pipeline("text-generation", model=model_name)
+
+# Load base model
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map=device_map
+)
+model.config.use_cache = False
+model.config.pretraining_tp = 1
+
+# Load LLaMA tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+tokenizer.pad_token = tokenizer.eos_token
+tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
+
+from transformers import pipeline
+
+
 
 def generate_text(prompt):
     # Generate text based on the input prompt
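
Note that the hunk references names defined earlier in app.py that never appear in the diff context: torch, BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, and the QLoRA settings use_4bit, bnb_4bit_compute_dtype, bnb_4bit_quant_type, and use_nested_quant. Note also that in the new file the "from transformers import pipeline" line lands after pipe is already created. A minimal sketch of what that assumed preamble presumably looks like, with all imports first and placeholder values typical for 4-bit NF4 loading (the actual values in app.py may differ):

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# Assumed QLoRA settings (placeholders; the real values live earlier in app.py)
use_4bit = True                     # load the base model in 4-bit precision
bnb_4bit_compute_dtype = "float16"  # compute dtype used for 4-bit matmuls
bnb_4bit_quant_type = "nf4"         # quantization type ("fp4" or "nf4")
use_nested_quant = False            # whether to apply double quantization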
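
The diff truncates generate_text after its opening comment, so the function body is not shown here. For reference, a hypothetical call against the pipe object created above would look like the following; the prompt text and max_new_tokens value are illustrative, not part of the commit:

# Hypothetical usage of the text-generation pipeline (not part of the commit)
prompt = "What is Traditional Chinese Medicine?"
outputs = pipe(prompt, max_new_tokens=200)
print(outputs[0]["generated_text"])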