sugiv committed
Commit 506d5f3
1 Parent(s): 83a0536

Adding a simple monkey search for LeetCode - Darn LeetMonkey

Files changed (1): app.py (+8, -18)
app.py CHANGED
@@ -1,38 +1,27 @@
  import gradio as gr
- from pinecone import Pinecone, ServerlessSpec
+ from pinecone import Pinecone
  import torch
  from pinecone_text.sparse import SpladeEncoder
  from sentence_transformers import SentenceTransformer
- import transformers
- transformers.logging.set_verbosity_error()
  from transformers import AutoTokenizer, AutoModelForCausalLM
-
-
  import os
- PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

+ # Initialize Pinecone
+ PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
  pc = Pinecone(api_key=PINECONE_API_KEY)
-
  index_name = "leetmonkey-sparse-dense"
  index = pc.Index(index_name)

-
-
  # Initialize models
  device = 'cpu'
  splade = SpladeEncoder(device=device)
  dense_model = SentenceTransformer('sentence-transformers/all-Mpnet-base-v2', device=device)

  # Load the quantized Llama 2 model and tokenizer
- model_name = "TheBloke/Llama-2-7B-Chat-GGML"
+ model_name = "distilgpt2" # Using distilgpt2 for CPU efficiency
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True)

-
- # Disable Exllama backend if needed
- if hasattr(model, 'quantization_config'):
-     model.quantization_config.use_exllama = False
-
  def search_problems(query, top_k=5):
      dense_query = dense_model.encode([query])[0].tolist()
      sparse_query = splade.encode_documents([query])[0]
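Note on the model swap in this hunk: transformers' AutoModelForCausalLM.from_pretrained cannot load a GGML checkpoint such as TheBloke/Llama-2-7B-Chat-GGML, since GGML is a llama.cpp weight format rather than PyTorch/safetensors, which also made the removed Exllama toggle dead code. Switching to distilgpt2 keeps the app loadable on CPU. If the quantized Llama 2 chat model were still wanted, a library like ctransformers would be the usual route; a minimal sketch, assuming the ctransformers package and a typical quantization file name (neither is part of this commit):

# Sketch only: load a GGML checkpoint via ctransformers instead of transformers.
# The model_file value is an assumed quantization variant, not taken from this repo.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7B-Chat-GGML",
    model_file="llama-2-7b-chat.ggmlv3.q4_0.bin",  # assumption
    model_type="llama",
)
print(llm("Recommend LeetCode problems about binary search.", max_new_tokens=64))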
@@ -71,13 +60,14 @@ def generate_response(user_query, top_k=5):
      user_prompt = f"Based on the following query, recommend relevant LeetCode problems:\n{user_query}"
      full_prompt = f"{system_prompt}\n\n{few_shot_prompt}\n{user_prompt}\n\nRecommendations:"

-     input_ids = tokenizer.encode(full_prompt, return_tensors="pt").to(model.device)
+     input_ids = tokenizer.encode(full_prompt, return_tensors="pt").to(device)
      attention_mask = torch.ones_like(input_ids)

      with torch.no_grad():
          output = model.generate(
              input_ids,
-             max_new_tokens=100, # Reduce this for faster generation
+             attention_mask=attention_mask,
+             max_new_tokens=100, # Adjust as needed
              do_sample=True,
              top_p=0.9,
              temperature=0.7,
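Note on the generation change: passing attention_mask explicitly silences the transformers warning about ambiguous padding, and switching .to(model.device) to .to(device) is equivalent here since the model is loaded on CPU. distilgpt2, like all GPT-2 variants, defines no pad token, so a mask of ones is correct for this single unpadded prompt; batched generation would first need a pad token. A minimal sketch of that common workaround (an assumption, not part of this commit):

# Sketch: reuse EOS as the pad token so the tokenizer can pad a batch.
tokenizer.pad_token = tokenizer.eos_token
batch = tokenizer(
    ["two pointers", "dynamic programming"],
    return_tensors="pt",
    padding=True,
)
out = model.generate(
    batch["input_ids"],
    attention_mask=batch["attention_mask"],
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,  # avoids the open-end generation warning
)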
@@ -99,4 +89,4 @@ iface = gr.Interface(
  )

  # Launch the app
- iface.launch(share=True)
+ iface.launch(share=True)
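Note on search_problems: the first hunk only shows the query being encoded twice, once densely with all-Mpnet-base-v2 and once sparsely with SPLADE; the Pinecone call itself sits in unchanged lines the diff hides. For reference, a hybrid sparse-dense query against the index defined at the top typically looks like this (a sketch assuming the current pinecone client API; SpladeEncoder already returns the {"indices": ..., "values": ...} dict that sparse_vector expects):

# Sketch: hybrid query combining both representations of the same text.
results = index.query(
    vector=dense_query,          # 768-dim dense embedding
    sparse_vector=sparse_query,  # {"indices": [...], "values": [...]} from SPLADE
    top_k=top_k,
    include_metadata=True,
)
for match in results.matches:
    print(match.id, match.score)

Incidentally, pinecone_text also exposes encode_queries, the query-side counterpart of encode_documents used above.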
 
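The gr.Interface definition itself falls outside the changed hunks, so the diff never shows it. Purely as orientation, a typical wiring for this app could look like the following; this is a hypothetical reconstruction, and every argument is an assumption rather than the file's actual code:

# Hypothetical sketch of the unchanged interface block; not taken from app.py.
iface = gr.Interface(
    fn=generate_response,  # defined earlier in app.py
    inputs=gr.Textbox(label="What kind of problem do you want to practice?"),
    outputs=gr.Textbox(label="Recommended LeetCode problems"),
    title="LeetMonkey",
)
iface.launch(share=True)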