from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from typing import List, Dict, Any
import gc

class ResponseGenerator:
    def __init__(self, model_name: str = "mistralai/Mistral-7B-Instruct-v0.1"):
        """
        Initialize the response generator with an LLM
        Optimized for 8-11GB GPU
        """
        print(f"Loading LLM: {model_name}")
        print("This may take a few minutes...")
        
        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # Configure device and data type based on available resources
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")
        
        # Free up memory before loading model
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()
        
        # Configure 4-bit quantization for maximum memory efficiency
        try:
            # Use 4-bit quantization for models that support it
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4"
            )
            
            # Load the model with quantization
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quantization_config,
                device_map="auto",
                torch_dtype=torch.float16,
                # Load model in parts to avoid OOM errors
                max_memory={0: "8GiB", "cpu": "16GiB"},
                offload_folder="offload",
                offload_state_dict=True,  # Temporarily offload the state dict while loading to avoid exhausting CPU RAM
                low_cpu_mem_usage=True
            )
        except Exception as e:
            print(f"4-bit quantization error: {e}")
            print("Falling back to 8-bit quantization...")
            
            try:
                # Try 8-bit quantization
                # Double quantization is a 4-bit-only option, so the 8-bit
                # config only needs load_in_8bit
                quantization_config = BitsAndBytesConfig(
                    load_in_8bit=True
                )
                
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    quantization_config=quantization_config,
                    device_map="auto",
                    torch_dtype=torch.float16,
                    max_memory={0: "8GiB", "cpu": "16GiB"},
                    offload_folder="offload",
                    low_cpu_mem_usage=True
                )
            except Exception as e2:
                print(f"8-bit quantization error: {e2}")
                print("Falling back to smaller model...")
                
                # Use a much smaller model as fallback
                backup_model = "microsoft/phi-2"
                self.tokenizer = AutoTokenizer.from_pretrained(backup_model)
                self.model = AutoModelForCausalLM.from_pretrained(
                    backup_model,
                    device_map="auto",
                    torch_dtype=torch.float16 if device == "cuda" else torch.float32
                )
        
        print("LLM loaded successfully")
    
    def generate_response(self, query: str, relevant_faqs: List[Dict[str, Any]]) -> str:
        """
        Generate a response using the LLM with retrieved FAQs as context
        Memory-optimized version
        """
        # Create prompt with relevant FAQs
        prompt = self._create_prompt(query, relevant_faqs)
        
        # Generate response with memory-efficient settings
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        
        with torch.no_grad():
            # Use more conservative generation parameters
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=200,  # Shorter response for memory efficiency
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
        
        # Decode only the newly generated tokens; slicing the decoded string by
        # len(prompt) is unreliable because the prompt may not round-trip
        # exactly through the tokenizer
        prompt_length = inputs["input_ids"].shape[1]
        response = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()
        
        # Clear GPU memory after generating response
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        return response
    
    def _create_prompt(self, query: str, relevant_faqs: List[Dict[str, Any]]) -> str:
        """
        Create a prompt for the LLM with retrieved FAQs as context
        """
        # Format FAQs in a way that's suitable for the model
        faq_context = "\n\n".join([
            f"Q: {faq['question']}\nA: {faq['answer']}" 
            for faq in relevant_faqs
        ])
        
        # Create the prompt
        prompt = f"""
Below are some relevant e-commerce customer support FAQ entries:

{faq_context}

Based on the information above, please provide a helpful, accurate, and concise response to the following customer query:
Customer Query: {query}

Response:
"""
        return prompt
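

# --- Example usage (illustrative sketch) ---
# The snippet below shows one way this class might be wired into the rest of
# the pipeline. The FAQ dicts here are hypothetical stand-ins; in the full
# system they would come from a retriever, and only the 'question' and
# 'answer' keys are required by generate_response().
if __name__ == "__main__":
    generator = ResponseGenerator()

    # Hypothetical retrieval results for demonstration purposes
    sample_faqs = [
        {
            "question": "How long does standard shipping take?",
            "answer": "Standard shipping typically takes 5-7 business days.",
        },
        {
            "question": "Can I change my shipping address after ordering?",
            "answer": "Yes, the address can be updated until the order ships.",
        },
    ]

    answer = generator.generate_response(
        "When will my order arrive?", sample_faqs
    )
    print(answer)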