from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from typing import List, Dict, Any
import gc
import psutil


class ResponseGenerator:
    def __init__(self, model_name: str = "microsoft/phi-2"):
        """
        Initialize the response generator with an LLM
        """
        print(f"Loading LLM: {model_name}")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Free any leftover memory before loading the model weights
        gc.collect()
        if self.device == "cuda":
            torch.cuda.empty_cache()

        try:
            if self.device == "cuda":
                # 4-bit NF4 quantization keeps the model within a modest GPU budget
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4"
                )
                available_memory = psutil.virtual_memory().total / (1024 ** 3)
                gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
                # Cap usage at 15 GiB of GPU memory and 30 GiB of CPU memory for offloading
                max_memory = {0: f"{min(gpu_memory, 15)}GiB", "cpu": f"{min(available_memory, 30)}GiB"}
                print(f"Setting max_memory: {max_memory}")
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    quantization_config=quantization_config,
                    device_map="auto",
                    torch_dtype=torch.float16,
                    max_memory=max_memory,
                    offload_folder="offload",
                    offload_state_dict=True,
                    low_cpu_mem_usage=True
                )
            else:
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    device_map={"": "cpu"},
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True
                )
        except Exception as e:
            print(f"Model loading error: {e}")
            print("Falling back to TinyLlama...")
            # Fall back to a smaller model that loads without quantization
            model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map={"": self.device},
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
            )

        print("LLM loaded successfully")

    def generate_response(self, query: str, relevant_faqs: List[Dict[str, Any]]) -> str:
        """
        Generate a response using the LLM with retrieved FAQs as context
        """
        prompt = self._create_prompt(query, relevant_faqs)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=150,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # Decode the full sequence, then strip the echoed prompt so only the
        # newly generated text is returned
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = response[len(prompt):].strip()

        if self.device == "cuda":
            torch.cuda.empty_cache()
        return response

    def _create_prompt(self, query: str, relevant_faqs: List[Dict[str, Any]]) -> str:
        """
        Create a prompt for the LLM with retrieved FAQs as context
        """
        faq_context = "\n\n".join(
            [f"Q: {faq['question']}\nA: {faq['answer']}" for faq in relevant_faqs]
        )
        prompt = f"""
Below are some relevant e-commerce customer support FAQ entries:

{faq_context}

Based on the information above, provide a helpful, accurate, and concise response to the following customer query:

Customer Query: {query}

Response:
"""
        return prompt
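

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module).
# It assumes a retriever elsewhere in the project returns FAQ dicts with
# "question" and "answer" keys, which is the shape _create_prompt expects.
# The example FAQs and query below are made up for demonstration only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    generator = ResponseGenerator()  # loads microsoft/phi-2, or TinyLlama on failure

    # Hypothetical retrieval output; in the real app these would come from
    # a similarity search over the FAQ dataset.
    retrieved_faqs = [
        {"question": "How long does shipping take?",
         "answer": "Standard shipping takes 3-5 business days."},
        {"question": "Can I track my order?",
         "answer": "Yes, a tracking link is emailed once the order ships."},
    ]

    answer = generator.generate_response("Where is my package?", retrieved_faqs)
    print(answer)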