import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from config import MODEL_NAME
import spaces

model = None
tokenizer = None


@spaces.GPU
def load_model():
    """Lazily load the model and tokenizer, caching them in module-level globals."""
    global model, tokenizer
    if model is None or tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        model.eval()
    return model, tokenizer


@spaces.GPU
def generate_response(message, history, max_length=512, temperature=0.7):
    model, tokenizer = load_model()

    # Prepare input: append the new user turn to any prior conversation text
    if history:
        input_text = history + f"\nUser: {message}\nAssistant:"
    else:
        input_text = f"User: {message}\nAssistant:"

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,  # total length, including the prompt tokens
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the assistant's response (text after the final "Assistant:" marker)
    if "Assistant:" in response:
        response = response.split("Assistant:")[-1].strip()

    return response
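

# Minimal usage sketch (not part of the original file): this assumes `history`
# is the plain prompt string accumulated so far, as the string concatenation in
# generate_response implies, and that config.MODEL_NAME points at a causal LM.
if __name__ == "__main__":
    first_reply = generate_response("Hello! What can you do?", history="")
    print(first_reply)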