Update app.py
app.py
CHANGED
@@ -1,86 +1,9 @@
 import gradio as gr
-import os
-import torch
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    BitsAndBytesConfig,
-    HfArgumentParser,
-    TrainingArguments,
-    pipeline,
-    logging,
-)
-
-
-################################################################################
-# bitsandbytes parameters
-################################################################################
-
-# Activate 4-bit precision base model loading
-use_4bit = True
-
-# Compute dtype for 4-bit base models
-bnb_4bit_compute_dtype = "float16"
-
-# Quantization type (fp4 or nf4)
-bnb_4bit_quant_type = "nf4"
-
-# Activate nested quantization for 4-bit base models (double quantization)
-use_nested_quant = False
-
-
-################################################################################
-# SFT parameters
-################################################################################
-
-# Maximum sequence length to use
-max_seq_length = None
-
-# Pack multiple short examples in the same input sequence to increase efficiency
-packing = False
-
-# Load the entire model on the GPU 0
-device_map = {"": 0}
-
-# Load tokenizer and model with QLoRA configuration
-compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
-
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=use_4bit,
-    bnb_4bit_quant_type=bnb_4bit_quant_type,
-    bnb_4bit_compute_dtype=compute_dtype,
-    bnb_4bit_use_double_quant=use_nested_quant,
-)
-
-# Check GPU compatibility with bfloat16
-if compute_dtype == torch.float16 and use_4bit:
-    major, _ = torch.cuda.get_device_capability()
-    if major >= 8:
-        print("=" * 80)
-        print("Your GPU supports bfloat16: accelerate training with bf16=True")
-        print("=" * 80)
-
-# Initialize the pipeline with the LLaMA model
-model_name = "ksh-nyp/llama-2-7b-chat-TCMKB2"
-pipe = pipeline("text-generation", model=model_name)
-
-# Load base model
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    quantization_config=bnb_config,
-    device_map=device_map
-)
-model.config.use_cache = False
-model.config.pretraining_tp = 1
-
-# Load LLaMA tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-tokenizer.pad_token = tokenizer.eos_token
-tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
-
 from transformers import pipeline
 
-
+# Initialize the pipeline with the model for CPU usage
+model_name = "ksh-nyp/llama-2-7b-chat-TCMKB2"
+pipe = pipeline("text-generation", model=model_name, device=-1)  # device=-1 runs on CPU (device=0 would select the first GPU)
 
 def generate_text(prompt):
     # Generate text based on the input prompt
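
The hunk ends at the first line of generate_text, so the rest of the new app.py is not shown. For reference, here is a minimal sketch of how a pipeline-backed generate_text is typically wired into a Gradio app; the function body, the generation parameters, and the interface wiring are illustrative assumptions, not code from this commit.

import gradio as gr
from transformers import pipeline

model_name = "ksh-nyp/llama-2-7b-chat-TCMKB2"
pipe = pipeline("text-generation", model=model_name, device=-1)  # device=-1 runs on CPU

def generate_text(prompt):
    # Generate text based on the input prompt; max_new_tokens=256 is an
    # illustrative choice, not a value taken from this commit
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True)
    return outputs[0]["generated_text"]

# Expose the function through a simple text-in/text-out Gradio interface
demo = gr.Interface(fn=generate_text, inputs="text", outputs="text")
demo.launch()

Note that with the BitsAndBytesConfig path removed, the 7B model now loads in full precision; in float32 on a CPU-only Space that is on the order of 25-30 GB of RAM, which is the trade-off the deleted quantization block was there to avoid.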