th8m0z committed on
Commit
b5ac495
1 Parent(s): 54f6539

more comments

Browse files
__pycache__/app.cpython-311.pyc CHANGED
Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ
 
__pycache__/semantic_search.cpython-311.pyc CHANGED
Binary files a/__pycache__/semantic_search.cpython-311.pyc and b/__pycache__/semantic_search.cpython-311.pyc differ
 
__pycache__/ui.cpython-311.pyc CHANGED
Binary files a/__pycache__/ui.cpython-311.pyc and b/__pycache__/ui.cpython-311.pyc differ
 
app.py CHANGED
@@ -35,7 +35,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
35
  doc.close()
36
  return text_list
37
 
38
- # one text converts a list of chunks
39
  def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
40
 
41
  filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
@@ -56,6 +56,7 @@ def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
56
  return chunks
57
 
58
 
 
59
  def load_recommender(paths, start_page=1):
60
  global recommender
61
  texts = []
@@ -66,6 +67,8 @@ def load_recommender(paths, start_page=1):
66
  recommender.fit(chunks)
67
  return 'Corpus Loaded.'
68
 
 
 
69
  def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
70
  openai.api_key = openAI_key
71
  temperature=0.7
@@ -101,6 +104,7 @@ def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
101
  return message
102
 
103
 
 
104
  def construct_prompt(question):
105
  topn_chunks = recommender(question)
106
  prompt = 'search results:\n\n'
@@ -114,6 +118,7 @@ def construct_prompt(question):
114
  prompt += f"{question}\nAnswer:"
115
  return prompt
116
 
 
117
  def question_answer(chat_history, url, files, question, openAI_key, model):
118
  try:
119
  if files == None:
@@ -150,6 +155,3 @@ def question_answer(chat_history, url, files, question, openAI_key, model):
150
  return chat_history
151
  except openai.error.InvalidRequestError as e:
152
  return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
153
-
154
-
155
-
 
35
  doc.close()
36
  return text_list
37
 
38
+ # converts a text into a list of chunks
39
  def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
40
 
41
  filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
 
56
  return chunks
57
 
58
 
59
+ # merges a list of pdfs into a list of chunks and fits the recommender
60
  def load_recommender(paths, start_page=1):
61
  global recommender
62
  texts = []
 
67
  recommender.fit(chunks)
68
  return 'Corpus Loaded.'
69
 
70
+
71
+ # calls the OpenAI API to generate a response for the given query
72
  def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
73
  openai.api_key = openAI_key
74
  temperature=0.7
 
104
  return message
105
 
106
 
107
+ # constructs the prompt for the given query
108
  def construct_prompt(question):
109
  topn_chunks = recommender(question)
110
  prompt = 'search results:\n\n'
 
118
  prompt += f"{question}\nAnswer:"
119
  return prompt
120
 
121
+ # main function that is called when the user clicks the submit button, generates an answer for the query
122
  def question_answer(chat_history, url, files, question, openAI_key, model):
123
  try:
124
  if files == None:
 
155
  return chat_history
156
  except openai.error.InvalidRequestError as e:
157
  return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
 
 
 
semantic_search.py CHANGED
@@ -2,13 +2,14 @@ import numpy as np
2
  import tensorflow_hub as hub
3
  from sklearn.neighbors import NearestNeighbors
4
 
 
 
5
  class SemanticSearch:
6
-
7
  def __init__(self):
8
  self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
9
  self.fitted = False
10
 
11
-
12
  def fit(self, data, batch=1000, n_neighbors=5):
13
  self.data = data
14
  self.embeddings = self.get_text_embedding(data, batch=batch)
@@ -16,7 +17,7 @@ class SemanticSearch:
16
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
17
  self.nn.fit(self.embeddings)
18
  self.fitted = True
19
-
20
 
21
  def __call__(self, text, return_data=True):
22
  inp_emb = self.use([text])
@@ -28,6 +29,7 @@ class SemanticSearch:
28
  return neighbors
29
 
30
 
 
31
  def get_text_embedding(self, texts, batch=1000):
32
  embeddings = []
33
  for i in range(0, len(texts), batch):
 
2
  import tensorflow_hub as hub
3
  from sklearn.neighbors import NearestNeighbors
4
 
5
+
6
+
7
  class SemanticSearch:
 
8
  def __init__(self):
9
  self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
10
  self.fitted = False
11
 
12
+ # fits the recommender
13
  def fit(self, data, batch=1000, n_neighbors=5):
14
  self.data = data
15
  self.embeddings = self.get_text_embedding(data, batch=batch)
 
17
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
18
  self.nn.fit(self.embeddings)
19
  self.fitted = True
20
+
21
 
22
  def __call__(self, text, return_data=True):
23
  inp_emb = self.use([text])
 
29
  return neighbors
30
 
31
 
32
+ # returns embeddings
33
  def get_text_embedding(self, texts, batch=1000):
34
  embeddings = []
35
  for i in range(0, len(texts), batch):
ui.py CHANGED
@@ -1,8 +1,6 @@
1
  import gradio as gr
2
  import app as app
3
 
4
-
5
-
6
  # pre-defined questions
7
  questions = [
8
  "What did the study investigate?",
 
1
  import gradio as gr
2
  import app as app
3
 
 
 
4
  # pre-defined questions
5
  questions = [
6
  "What did the study investigate?",