sugiv committed
Commit 83a0536
Parent: 8a7855a

Adding a simple monkey search for Leetcode - Darn LeetMonkey

Files changed (1)
  app.py  +9 -10
app.py CHANGED
@@ -5,7 +5,7 @@ from pinecone_text.sparse import SpladeEncoder
 from sentence_transformers import SentenceTransformer
 import transformers
 transformers.logging.set_verbosity_error()
-from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
 
 import os
@@ -16,11 +16,6 @@ pc = Pinecone(api_key=PINECONE_API_KEY)
 index_name = "leetmonkey-sparse-dense"
 index = pc.Index(index_name)
 
-quantization_config = GPTQConfig(
-    bits=8,
-    disable_exllama=True
-)
-
 
 
 # Initialize models
@@ -29,9 +24,14 @@ splade = SpladeEncoder(device=device)
 dense_model = SentenceTransformer('sentence-transformers/all-Mpnet-base-v2', device=device)
 
 # Load the quantized Llama 2 model and tokenizer
-model_name = "TheBloke/Llama-2-7B-Chat-GPTQ"
+model_name = "TheBloke/Llama-2-7B-Chat-GGML"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quantization_config)
+model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True)
+
+
+# Disable Exllama backend if needed
+if hasattr(model, 'quantization_config'):
+    model.quantization_config.use_exllama = False
 
 def search_problems(query, top_k=5):
     dense_query = dense_model.encode([query])[0].tolist()
@@ -77,8 +77,7 @@ def generate_response(user_query, top_k=5):
     with torch.no_grad():
         output = model.generate(
             input_ids,
-            attention_mask=attention_mask,
-            max_new_tokens=250,
+            max_new_tokens=100,  # Reduce this for faster generation
             do_sample=True,
             top_p=0.9,
             temperature=0.7,
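
Review note: as far as I can tell, the transformers AutoModelForCausalLM loader does not read GGML .bin checkpoints, and the TheBloke/Llama-2-7B-Chat-GGML repo ships only GGML files, so the new from_pretrained call may fail at load time; the added Exllama guard also only applies to GPTQ-quantized models, so it would never trigger here. A minimal sketch of loading the same repo with the ctransformers library instead — the model_file name below is an assumption and should be matched to whichever quantization file the repo actually contains:

    from ctransformers import AutoModelForCausalLM

    # Hypothetical GGML load via ctransformers; pick a .bin that exists in the repo
    llm = AutoModelForCausalLM.from_pretrained(
        "TheBloke/Llama-2-7B-Chat-GGML",
        model_file="llama-2-7b-chat.ggmlv3.q4_K_M.bin",  # assumed filename
        model_type="llama",
    )

    # Generation settings mirror the diff above (max_new_tokens, top_p, temperature)
    print(llm("Suggest a LeetCode problem about binary search.",
              max_new_tokens=100, top_p=0.9, temperature=0.7))

Separately, dropping attention_mask from model.generate() is usually harmless for a single unpadded prompt, though transformers will emit a warning; if batched or padded inputs are ever passed, it should be restored.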