code5ecure committed on
Commit
408c301
·
verified ·
1 Parent(s): 729d634

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -6
app.py CHANGED
@@ -4,7 +4,6 @@ import numpy as np
4
  import gradio as gr
5
  from sentence_transformers import SentenceTransformer
6
  import faiss
7
- from bitsandbytes import quantize_model
8
 
9
  # Disable torch.compile to avoid meta device issues
10
  torch._dynamo.config.suppress_errors = True
@@ -13,14 +12,13 @@ torch.set_default_dtype(torch.float32)
13
  # Set device explicitly
14
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
 
16
- # Load LLaMA 3.2 1B model with 4-bit quantization
17
  model_name = "meta-llama/Llama-3.2-1B-Instruct"
18
  tokenizer = AutoTokenizer.from_pretrained(model_name)
19
  model = AutoModelForCausalLM.from_pretrained(
20
  model_name,
21
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
22
- load_in_4bit=True, # Enable 4-bit quantization
23
- device_map="auto" # Automatically map to available device
24
  ).to(device)
25
 
26
  # Differential Privacy parameters
@@ -57,7 +55,7 @@ def build_rag_index(texts):
57
  global embedder, index
58
  try:
59
  embedder = SentenceTransformer('xmanii/maux-gte-persian', device='cpu') # Use CPU to save memory
60
- embeddings = embedder.encode(texts, convert_to_tensor=True, batch_size=16).cpu().numpy() # Smaller batch size
61
  dimension = embeddings.shape[1]
62
  index = faiss.IndexFlatL2(dimension)
63
  index.add(embeddings)
@@ -119,7 +117,7 @@ def chat(message, history):
119
  outputs = model.generate(
120
  input_ids=inputs["input_ids"],
121
  attention_mask=inputs["attention_mask"],
122
- max_length=50,
123
  num_beams=5,
124
  no_repeat_ngram_size=2,
125
  early_stopping=True,
 
4
  import gradio as gr
5
  from sentence_transformers import SentenceTransformer
6
  import faiss
 
7
 
8
  # Disable torch.compile to avoid meta device issues
9
  torch._dynamo.config.suppress_errors = True
 
12
  # Set device explicitly
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
 
15
+ # Load LLaMA 3.2 1B model (no quantization for CPU compatibility)
16
  model_name = "meta-llama/Llama-3.2-1B-Instruct"
17
  tokenizer = AutoTokenizer.from_pretrained(model_name)
18
  model = AutoModelForCausalLM.from_pretrained(
19
  model_name,
20
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
21
+ device_map="auto" # Automatically map to available device
 
22
  ).to(device)
23
 
24
  # Differential Privacy parameters
 
55
  global embedder, index
56
  try:
57
  embedder = SentenceTransformer('xmanii/maux-gte-persian', device='cpu') # Use CPU to save memory
58
+ embeddings = embedder.encode(texts, convert_to_tensor=True, batch_size=8).cpu().numpy() # Smaller batch size
59
  dimension = embeddings.shape[1]
60
  index = faiss.IndexFlatL2(dimension)
61
  index.add(embeddings)
 
117
  outputs = model.generate(
118
  input_ids=inputs["input_ids"],
119
  attention_mask=inputs["attention_mask"],
120
+ max_length=100, # Increased for better responses
121
  num_beams=5,
122
  no_repeat_ngram_size=2,
123
  early_stopping=True,