Spaces: Runtime error
# model.py

import logging

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

# Logger configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger(__name__)
#model_path = "/opt/Llama-2-13B-chat-GPTQ"


class Model:
    def __init__(self, model_path):
        self.model_name = model_path
        self.model = None
        self.tokenizer = None
        self.loaded = False
    def load(self, precision='fp16'):
        try:
            # Check that a CUDA device is available
            if not torch.cuda.is_available():
                raise EnvironmentError("CUDA not available.")

            # Select the torch dtype from the requested precision
            if precision == 'fp16':
                torch_dtype = torch.float16
            else:
                torch_dtype = torch.float32

            # Initialize the tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

            # Set up the model configuration
            config = AutoConfig.from_pretrained(self.model_name)
            #config.quantization_config["disable_exllama"] = False
            #config.quantization_config["use_exllama"] = True
            #config.quantization_config["exllama_config"] = {"version": 2}
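            # Hedged alternative (an assumption, not verified against this checkpoint):
            # rather than mutating config.quantization_config, a GPTQConfig can be
            # passed to from_pretrained below, for example:
            #   from transformers import GPTQConfig
            #   quantization_config=GPTQConfig(bits=4, use_exllama=True,
            #                                  exllama_config={"version": 2})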
            # Load the model with the configuration and precision chosen above
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                config=config,
                device_map="cuda:0",  # place the model on GPU 0
                torch_dtype=torch_dtype,
            )
            self.loaded = True
            logger.info(f"Model loaded successfully on GPU with {precision} precision.")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
    def predict(self, input_text, max_length=50):
        if not self.loaded:
            logger.error("Model not loaded. Please load the model before prediction.")
            return None

        logger.info("========== Start Prediction ==========")
        try:
            # Ensure the input text is a string
            if not isinstance(input_text, str):
                raise ValueError("Input text must be a string.")

            # Encode the input text
            input_ids = self.tokenizer.encode(input_text, return_tensors='pt')

            # Move the input to the same device as the model
            input_ids = input_ids.to(next(self.model.parameters()).device)

            # Generate output; note that max_length counts the prompt tokens as well
            outputs = self.model.generate(input_ids, max_length=max_length)

            # Decode and return the generated text
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            logger.info("Response: {}".format(response))
        except Exception as e:
            logger.error(f"Error during prediction: {e}")
            response = None

        logger.info("========== End Prediction ==========")
        return response
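
# A minimal usage sketch, assuming the class above is importable from model.py;
# the checkpoint path and prompt are placeholders, not part of the original source.

# example_usage.py
from model import Model

m = Model("/opt/Llama-2-13B-chat-GPTQ")  # any local or Hub model path
m.load(precision='fp16')                 # logs an error and leaves m.loaded False if loading fails

if m.loaded:
    reply = m.predict("Hello, how are you?", max_length=50)
    print(reply)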