Spaces:

code5ecure
/

Yavar

Sleeping

App Files Files Community

code5ecure committed on Aug 18, 2025

Commit

408c301

verified ·

1 Parent(s): 729d634

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -6

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ import numpy as np
 import gradio as gr
 from sentence_transformers import SentenceTransformer
 import faiss
-from bitsandbytes import quantize_model
 # Disable torch.compile to avoid meta device issues
 torch._dynamo.config.suppress_errors = True
@@ -13,14 +12,13 @@ torch.set_default_dtype(torch.float32)
 # Set device explicitly
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# Load LLaMA 3.2 1B model with 4-bit quantization
 model_name = "meta-llama/Llama-3.2-1B-Instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    load_in_4bit=True,  # Enable 4-bit quantization
-    device_map="auto"   # Automatically map to available device
 ).to(device)
 # Differential Privacy parameters
@@ -57,7 +55,7 @@ def build_rag_index(texts):
     global embedder, index
     try:
         embedder = SentenceTransformer('xmanii/maux-gte-persian', device='cpu')  # Use CPU to save memory
-        embeddings = embedder.encode(texts, convert_to_tensor=True, batch_size=16).cpu().numpy()  # Smaller batch size
         dimension = embeddings.shape[1]
         index = faiss.IndexFlatL2(dimension)
         index.add(embeddings)
@@ -119,7 +117,7 @@ def chat(message, history):
         outputs = model.generate(
             input_ids=inputs["input_ids"],
             attention_mask=inputs["attention_mask"],
-            max_length=50,
             num_beams=5,
             no_repeat_ngram_size=2,
             early_stopping=True,

 import gradio as gr
 from sentence_transformers import SentenceTransformer
 import faiss
 # Disable torch.compile to avoid meta device issues
 torch._dynamo.config.suppress_errors = True
 # Set device explicitly
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Load LLaMA 3.2 1B model (no quantization for CPU compatibility)
 model_name = "meta-llama/Llama-3.2-1B-Instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto"  # Automatically map to available device
 ).to(device)
 # Differential Privacy parameters
     global embedder, index
     try:
         embedder = SentenceTransformer('xmanii/maux-gte-persian', device='cpu')  # Use CPU to save memory
+        embeddings = embedder.encode(texts, convert_to_tensor=True, batch_size=8).cpu().numpy()  # Smaller batch size
         dimension = embeddings.shape[1]
         index = faiss.IndexFlatL2(dimension)
         index.add(embeddings)
         outputs = model.generate(
             input_ids=inputs["input_ids"],
             attention_mask=inputs["attention_mask"],
+            max_length=100,  # Increased for better responses
             num_beams=5,
             no_repeat_ngram_size=2,
             early_stopping=True,