code5ecure committed on
Commit
73eb94f
·
verified ·
1 Parent(s): c52be7e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -22
app.py CHANGED
@@ -4,6 +4,7 @@ import numpy as np
4
  import gradio as gr
5
  from sentence_transformers import SentenceTransformer
6
  import faiss
 
7
 
8
  # Disable torch.compile to avoid meta device issues
9
  torch._dynamo.config.suppress_errors = True
@@ -12,15 +13,21 @@ torch.set_default_dtype(torch.float32)
12
  # Set device explicitly
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
 
15
- # Load LLaMA 2 Persian model and tokenizer
16
- model_name = "sinarashidi/llama-2-7b-chat-persian"
17
  tokenizer = AutoTokenizer.from_pretrained(model_name)
18
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32).to(device)
 
 
 
 
 
19
 
20
  # Differential Privacy parameters
21
  epsilon = 1.0 # Privacy budget
22
  delta = 1e-5 # Privacy parameter
23
  sensitivity = 1.0 # Sensitivity of the query
 
24
 
25
  # Simple memory for conversation history
26
  conversation_history = []
@@ -48,14 +55,19 @@ def load_training_data():
48
  # Build RAG index
49
  def build_rag_index(texts):
50
  global embedder, index
51
- embedder = SentenceTransformer('xmanii/maux-gte-persian')
52
- embeddings = embedder.encode(texts, convert_to_tensor=True).cpu().numpy()
53
- dimension = embeddings.shape[1]
54
- index = faiss.IndexFlatL2(dimension)
55
- index.add(embeddings)
56
- return embedder, index
57
-
58
- # Fine-tune model with differential privacy (skipped to use only pretrained LLaMA 2)
 
 
 
 
 
59
  def train_model():
60
  global texts, embedder, index
61
  texts = load_training_data()
@@ -65,8 +77,7 @@ def train_model():
65
 
66
  # Build RAG index
67
  build_rag_index(texts)
68
-
69
- print("Skipping fine-tuning to use only pretrained LLaMA 2 model.")
70
 
71
  def add_noise(tensor, sensitivity, epsilon, delta):
72
  """Add Laplace noise for differential privacy."""
@@ -87,13 +98,15 @@ def chat(message, history):
87
  model.eval()
88
 
89
  # RAG retrieval
 
90
  if embedder and index:
91
- query_emb = embedder.encode(message, convert_to_tensor=True).cpu().numpy()
92
- D, I = index.search(query_emb, k=3)
93
- retrieved = [texts[i] for i in I[0] if i >= 0 and i < len(texts)]
94
- context = "\n".join(retrieved)
95
- else:
96
- context = ""
 
97
 
98
  # Prepare prompt with context
99
  prompt = f"Context: {context}\nUser: {message}\nBot:"
@@ -113,19 +126,26 @@ def chat(message, history):
113
  )
114
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
115
 
 
 
 
 
 
 
 
116
  # Update conversation history
117
  update_model(message, response)
118
 
119
  return response
120
 
121
- # Train the model on startup (now only loads data and builds RAG index)
122
  train_model()
123
 
124
  # Gradio interface
125
  iface = gr.ChatInterface(
126
  fn=chat,
127
- title="LLaMA 2 Persian Chatbot with RAG",
128
- description="Chat with pretrained LLaMA 2 Persian model using training_data.txt as RAG knowledge base."
129
  )
130
 
131
  if __name__ == "__main__":
 
import gradio as gr
from sentence_transformers import SentenceTransformer
import faiss
# BUG FIX: bitsandbytes exposes no `quantize_model` symbol, so the original
# `from bitsandbytes import quantize_model` raised ImportError at startup
# (and the name was never used). Importing the package is enough —
# transformers picks it up automatically for `load_in_4bit` quantization.
import bitsandbytes  # noqa: F401

# Disable torch.compile to avoid meta device issues
torch._dynamo.config.suppress_errors = True
 
# Set device explicitly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load LLaMA 3.2 1B model with 4-bit quantization
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    load_in_4bit=True,   # 4-bit quantization via bitsandbytes
    device_map="auto",   # accelerate places the weights on the right device
)
# BUG FIX: the original chained `.to(device)` here. A model loaded with
# device_map="auto" is dispatched by accelerate, and 4-bit quantized models
# cannot be moved with `.to()` — transformers raises a ValueError, so the
# app crashed on startup. device_map already handles placement.
25
 
# --- Differential-privacy settings ---------------------------------------
epsilon = 1.0       # privacy budget
delta = 1e-5        # privacy failure parameter
sensitivity = 1.0   # sensitivity of the query (scales the noise)
apply_dp = False    # set to True to enable DP noise during inference

# Simple in-memory store for the conversation history.
conversation_history = []
 
55
  # Build RAG index
56
def build_rag_index(texts):
    """Embed *texts* and build a FAISS L2 index over them.

    Sets the module-level ``embedder`` and ``index`` globals and returns
    ``(embedder, index)``; on any failure logs the error and returns
    ``(None, None)`` so the app can run without RAG.
    """
    global embedder, index
    try:
        # Embed on CPU with a small batch size to keep memory usage down.
        embedder = SentenceTransformer('xmanii/maux-gte-persian', device='cpu')
        vectors = embedder.encode(texts, convert_to_tensor=True, batch_size=16).cpu().numpy()
        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(vectors)
        print("RAG index built successfully")
        return embedder, index
    except Exception as e:
        print(f"Error building RAG index: {e}")
        return None, None
69
+
70
+ # Initialize model and RAG (no fine-tuning)
71
  def train_model():
72
  global texts, embedder, index
73
  texts = load_training_data()
 
77
 
78
  # Build RAG index
79
  build_rag_index(texts)
80
+ print("Using pretrained LLaMA 3.2 1B model without fine-tuning.")
 
81
 
82
  def add_noise(tensor, sensitivity, epsilon, delta):
83
  """Add Laplace noise for differential privacy."""
 
98
model.eval()

# RAG retrieval: pull the top-3 nearest training snippets as context.
context = ""
if embedder and index:
    try:
        query_emb = embedder.encode(message, convert_to_tensor=True).cpu().numpy()
        # BUG FIX: faiss `search` expects a 2-D (n_queries, dim) array, but
        # encoding a single string yields a 1-D vector — every search raised
        # and the except below silently disabled RAG. Reshape to one row.
        D, I = index.search(query_emb.reshape(1, -1), k=3)
        # Guard against faiss's -1 "no result" ids and stale indices.
        retrieved = [texts[i] for i in I[0] if 0 <= i < len(texts)]
        context = "\n".join(retrieved)
    except Exception as e:
        print(f"Error in RAG retrieval: {e}")

# Prepare prompt with context
prompt = f"Context: {context}\nUser: {message}\nBot:"
 
126
  )
127
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
128
 
129
+ # Apply differential privacy noise to logits (optional)
130
+ if apply_dp:
131
+ logits = model(**inputs).logits
132
+ noisy_logits = add_noise(logits, sensitivity, epsilon, delta)
133
+ response_ids = torch.argmax(noisy_logits, dim=-1)
134
+ response = tokenizer.decode(response_ids[0], skip_special_tokens=True)
135
+
136
  # Update conversation history
137
  update_model(message, response)
138
 
139
  return response
140
 
141
# Load the training data and build the RAG index at startup (no fine-tuning).
train_model()

# Gradio chat UI; `chat` is called with (message, history) by ChatInterface.
iface = gr.ChatInterface(
    fn=chat,
    title="LLaMA 3.2 1B Persian Chatbot with RAG",
    description="Chat with pretrained LLaMA 3.2 1B model using training_data.txt as RAG knowledge base.",
)
 
151
  if __name__ == "__main__":