paulokewunmi committed on
Commit
31dada8
1 Parent(s): 9d5744a

Add paraphrase feature

Browse files
.gitignore CHANGED
@@ -1,2 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  .DS_Store
2
- __pycache__
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # JetBrains IDE project settings
132
+ .idea/
133
+
134
+ # OS generated files
135
  .DS_Store
136
+ .DS_Store?
137
+ ._*
138
+ .Spotlight-V100
139
+ .Trashes
140
+ Icon?
141
+ ehthumbs.db
142
+ Thumbs.db
143
+
144
+ # secrets
145
+ .env
146
+ .vscode/
__pycache__/app.cpython-311.pyc DELETED
Binary file (17.1 kB)
 
app.py CHANGED
@@ -1,5 +1,3 @@
1
- # coding: utf-8
2
-
3
  import gradio as gr
4
  from src.document_utils import (
5
  summarize,
@@ -7,6 +5,7 @@ from src.document_utils import (
7
  generate_questions,
8
  load_history,
9
  load_science,
 
10
  )
11
  from src.wiki_search import cross_lingual_document_search, translate_text
12
  from src.theme import CustomTheme
@@ -27,13 +26,6 @@ def study_doc_qa_bot(input_document, history):
27
  bot_message = question_answer(input_document, history)
28
  history[-1][1] = bot_message
29
  return history
30
-
31
- # def translate_text(doc):
32
- # translator = EasyGoogleTranslate()
33
-
34
- # doc = " ".join(doc.split()[:4800])
35
- # result = translator.translate(doc, target_language='en')
36
- # return result
37
 
38
 
39
  custom_theme = CustomTheme()
@@ -50,7 +42,7 @@ with gr.Blocks(theme=custom_theme) as demo:
50
 
51
  with gr.TabItem("Document Search"):
52
  gr.HTML(
53
- """<p style="text-align:center;font-size:24px;"><b>Search across a set of study materials in your own native language or even a mix of languages.</p>"""
54
  )
55
  gr.HTML(
56
  """<p style="text-align:center; font-style:italic; font-size:16px;">Get started with a pre-indexed set of study materials spaning various subjects (History, Literature, Philosophy, Government etc) in 4 different languages.</p>"""
@@ -98,12 +90,7 @@ with gr.Blocks(theme=custom_theme) as demo:
98
  )
99
 
100
  with gr.Column():
101
- with gr.Accordion("Click to View Source", open=False):
102
-
103
- source_res_1 = gr.Textbox(
104
- label=f"Source Url",
105
-
106
- )
107
  translate_btn_1 = gr.Button(
108
  label="Translate Text",
109
  value="Translate Text",
@@ -113,16 +100,17 @@ with gr.Blocks(theme=custom_theme) as demo:
113
  label=f"Translation in English",
114
  )
115
 
 
 
 
 
116
  with gr.Row():
117
  with gr.Column():
118
  query_match_out_2 = gr.Textbox(label=f"Search Result 2")
119
 
120
  with gr.Column():
121
- with gr.Accordion("Click to View Source", open=False):
122
- source_res_2 = gr.Textbox(
123
- label=f"Source Url"
124
- )
125
-
126
  translate_btn_2 = gr.Button(
127
  label="Translate Text",
128
  value="Translate Text",
@@ -133,15 +121,18 @@ with gr.Blocks(theme=custom_theme) as demo:
133
 
134
  )
135
 
 
 
 
 
 
136
  with gr.Row():
137
  with gr.Column():
138
  query_match_out_3 = gr.Textbox(label=f"Search Result 3")
139
 
140
  with gr.Column():
141
- with gr.Accordion("Click to View Source", open=False):
142
- source_res_3 = gr.Textbox(
143
- label=f"Source Url"
144
- )
145
  translate_btn_3 = gr.Button(
146
  label="Translate Text",
147
  value="Translate Text",
@@ -150,6 +141,9 @@ with gr.Blocks(theme=custom_theme) as demo:
150
  translate_res_3= gr.Textbox(
151
  label=f"Translation in English",
152
  )
 
 
 
153
 
154
  with gr.TabItem("Q&A"):
155
  gr.HTML(
@@ -179,8 +173,6 @@ with gr.Blocks(theme=custom_theme) as demo:
179
  )
180
  clear = gr.Button("Clear", variant="primary")
181
 
182
-
183
-
184
  with gr.TabItem("Summarize"):
185
  gr.HTML(
186
  """<p style="text-align:center; font-size:24px;"><b> Get the most out of your study materials!</p>"""
@@ -246,7 +238,50 @@ with gr.Blocks(theme=custom_theme) as demo:
246
  with gr.Row():
247
  generate_output = gr.Text(label="Generated questions", lines=5)
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  # fetch answer for submitted question corresponding to input document
251
  input_question.submit(
252
  get_user_input,
@@ -303,17 +338,16 @@ with gr.Blocks(theme=custom_theme) as demo:
303
  queue=False,
304
  )
305
 
 
 
 
 
 
 
 
306
  # clear the chatbot Q&A history when this button is clicked by the user
307
  clear.click(lambda: None, None, chatbot, queue=False)
308
 
309
- # # run search as user is typing the query
310
- # user_query.change(
311
- # cross_lingual_document_search,
312
- # [user_query, num_search_results, lang_choices, text_match],
313
- # [query_match_out_1, query_match_out_2, query_match_out_3],
314
- # queue=False,
315
- # )
316
-
317
  # run search if user submits query
318
  user_query.submit(
319
  cross_lingual_document_search,
 
 
 
1
  import gradio as gr
2
  from src.document_utils import (
3
  summarize,
 
5
  generate_questions,
6
  load_history,
7
  load_science,
8
+ paraphrase
9
  )
10
  from src.wiki_search import cross_lingual_document_search, translate_text
11
  from src.theme import CustomTheme
 
26
  bot_message = question_answer(input_document, history)
27
  history[-1][1] = bot_message
28
  return history
 
 
 
 
 
 
 
29
 
30
 
31
  custom_theme = CustomTheme()
 
42
 
43
  with gr.TabItem("Document Search"):
44
  gr.HTML(
45
+ """<p style="text-align:center;font-size:24px;"><b>Search across a library of study materials in your own native language or even a mix of languages.</p>"""
46
  )
47
  gr.HTML(
48
  """<p style="text-align:center; font-style:italic; font-size:16px;">Get started with a pre-indexed set of study materials spaning various subjects (History, Literature, Philosophy, Government etc) in 4 different languages.</p>"""
 
90
  )
91
 
92
  with gr.Column():
93
+ with gr.Accordion("Click to View Translation/Source", open=False):
 
 
 
 
 
94
  translate_btn_1 = gr.Button(
95
  label="Translate Text",
96
  value="Translate Text",
 
100
  label=f"Translation in English",
101
  )
102
 
103
+ source_res_1 = gr.Textbox(
104
+ label=f"Source Url",
105
+ )
106
+
107
  with gr.Row():
108
  with gr.Column():
109
  query_match_out_2 = gr.Textbox(label=f"Search Result 2")
110
 
111
  with gr.Column():
112
+ with gr.Accordion("Click to View Translation/Source", open=False):
113
+
 
 
 
114
  translate_btn_2 = gr.Button(
115
  label="Translate Text",
116
  value="Translate Text",
 
121
 
122
  )
123
 
124
+ source_res_2 = gr.Textbox(
125
+ label=f"Source Url"
126
+ )
127
+
128
+
129
  with gr.Row():
130
  with gr.Column():
131
  query_match_out_3 = gr.Textbox(label=f"Search Result 3")
132
 
133
  with gr.Column():
134
+ with gr.Accordion("Click to View Translation/Source", open=False):
135
+
 
 
136
  translate_btn_3 = gr.Button(
137
  label="Translate Text",
138
  value="Translate Text",
 
141
  translate_res_3= gr.Textbox(
142
  label=f"Translation in English",
143
  )
144
+ source_res_3 = gr.Textbox(
145
+ label=f"Source Url"
146
+ )
147
 
148
  with gr.TabItem("Q&A"):
149
  gr.HTML(
 
173
  )
174
  clear = gr.Button("Clear", variant="primary")
175
 
 
 
176
  with gr.TabItem("Summarize"):
177
  gr.HTML(
178
  """<p style="text-align:center; font-size:24px;"><b> Get the most out of your study materials!</p>"""
 
238
  with gr.Row():
239
  generate_output = gr.Text(label="Generated questions", lines=5)
240
 
241
+ with gr.TabItem("Paraphrase"):
242
+ gr.HTML(
243
+ """<p style="text-align:center;"><b>Paraphraser. Add your document below and generate a rephrase for it.</p>"""
244
+ )
245
+
246
+ with gr.Row():
247
+ with gr.Column():
248
+ paraphrase_input = gr.Text(label="Document", lines=10)
249
+ generate_paraphrase = gr.Button("Paraphrase", variant="primary")
250
+
251
+ with gr.Column():
252
+ paraphrase_output = gr.HTML(label="Paraphrase", lines=10)
253
+ invisible_comp = gr.Text(label="Dummy Component", visible=False)
254
 
255
+ with gr.Row():
256
+ with gr.Accordion("Advanced Settings:", open=False):
257
+ paraphrase_length = gr.Radio(
258
+ ["short", "medium", "long"],
259
+ label="Paraphrase Length",
260
+ value="long",
261
+ )
262
+ paraphrase_format = gr.Radio(
263
+ ["paragraph", "bullets"],
264
+ label="Paraphrase Format",
265
+ value="bullets",
266
+ )
267
+ extractiveness = gr.Radio(
268
+ ["low", "medium", "high"],
269
+ label="Extractiveness",
270
+ info="Controls how close to the original text the paraphrase is.",
271
+ visible=False,
272
+ value="high",
273
+ )
274
+ temperature = gr.Slider(
275
+ minimum=0,
276
+ maximum=5.0,
277
+ value=0.64,
278
+ step=0.1,
279
+ interactive=True,
280
+ visible=False,
281
+ label="Temperature",
282
+ info="Controls the randomness of the output. Lower values tend to generate more “predictable” output, while higher values tend to generate more “creative” output.",
283
+ )
284
+
285
  # fetch answer for submitted question corresponding to input document
286
  input_question.submit(
287
  get_user_input,
 
338
  queue=False,
339
  )
340
 
341
+ generate_paraphrase.click(
342
+ paraphrase,
343
+ [paraphrase_input],
344
+ [paraphrase_output],
345
+ queue=False,
346
+ )
347
+
348
  # clear the chatbot Q&A history when this button is clicked by the user
349
  clear.click(lambda: None, None, chatbot, queue=False)
350
 
 
 
 
 
 
 
 
 
351
  # run search if user submits query
352
  user_query.submit(
353
  cross_lingual_document_search,
src/document_utils.py CHANGED
@@ -4,6 +4,7 @@ import sys
4
  import pandas as pd
5
  from typing import List
6
  import pinecone
 
7
 
8
  import cohere
9
  from langchain.embeddings.cohere import CohereEmbeddings
@@ -17,6 +18,7 @@ sys.path.append(os.path.abspath('..'))
17
  from src.constants import SUMMARIZATION_MODEL, EXAMPLES_FILE_PATH
18
 
19
 
 
20
  PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
21
  PINECONE_ENV = os.environ.get("PINECONE_ENV")
22
  COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
@@ -120,14 +122,40 @@ def question_answer(input_document: str, history: List) -> str:
120
  return answer
121
 
122
  def generate_questions(input_document: str) -> str:
123
- generated_response = cohere.Client(COHERE_API_KEY).generate(
124
- prompt = f"Give me 5 different questions to test understanding of the following text provided. Here's the provided text: {input_document}. Now what is Questions 1 to 5 ?:",
125
- max_tokens = 200,
126
- temperature = 0.55
127
- )
128
- # prompt = f"Generate 5 different quiz questions to test the understanding of the following text. Here's the provided text: {input_document}. Whats Questions 1 to 5 of the quiz ?:"
129
- # print(prompt)
130
- return generated_response.generations[0].text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
 
133
  def load_science():
@@ -143,6 +171,51 @@ def load_history():
143
  sample_question = examples_df["question"].iloc[1]
144
  return history_doc, sample_question
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  if __name__ == "__main__":
148
  with open('sample_text.txt', 'r') as file:
 
4
  import pandas as pd
5
  from typing import List
6
  import pinecone
7
+ import difflib
8
 
9
  import cohere
10
  from langchain.embeddings.cohere import CohereEmbeddings
 
18
  from src.constants import SUMMARIZATION_MODEL, EXAMPLES_FILE_PATH
19
 
20
 
21
+
22
  PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
23
  PINECONE_ENV = os.environ.get("PINECONE_ENV")
24
  COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
 
122
  return answer
123
 
124
  def generate_questions(input_document: str) -> str:
125
+ co = cohere.Client(COHERE_API_KEY)
126
+ prompt = f"""Write five different questions to test the understanding of the following text. The questions should be short answer, with one or two words each, and vary in difficulty from easy to hard. Provide the correct answer for each question after the question.
127
+ Now write your own questions for this text:
128
+
129
+ Text: {input_document}
130
+
131
+ Question 1: (question_1)
132
+ Answer: (answer_1)
133
+
134
+ Question 2: (question_2)
135
+ Answer: (answer_2)
136
+
137
+ Question 3: (question_3)
138
+ Answer: (answer_3)
139
+
140
+ Question 4: (question_4)
141
+ Answer: (answer_4)
142
+
143
+ Question 5: (question_5)
144
+ Answer: (answer_5)"""
145
+
146
+
147
+ response = co.generate(model='command', prompt=prompt, temperature=2, max_tokens=1000, )
148
+
149
+ answer = response.generations[0].text.strip()
150
+ print(answer)
151
+ questions = answer.split('\n\n')
152
+ print(questions)
153
+ result = {}
154
+ for question in questions:
155
+ q, a = question.split('\n')
156
+ result[q] = a.split(': ')[1]
157
+
158
+ return answer
159
 
160
 
161
  def load_science():
 
171
  sample_question = examples_df["question"].iloc[1]
172
  return history_doc, sample_question
173
 
174
+ def show_diff_html(seqm):
175
+ """Unify operations between two compared strings
176
+ seqm is a difflib.SequenceMatcher instance whose a & b are strings
177
+ """
178
+ output = []
179
+ for opcode, a0, a1, b0, b1 in seqm.get_opcodes():
180
+ if opcode == 'equal':
181
+ output.append(seqm.b[b0:b1])
182
+ elif opcode == 'insert':
183
+ output.append(f"<span style='background-color:lime;'>{seqm.b[b0:b1]}</span>")
184
+ # elif opcode == 'delete':
185
+ # output.append(f"<span style='background-color:red;'>{seqm.a[a0:a1]}</span>")
186
+ elif opcode == 'replace':
187
+ # output.append(f"<span style='background-color:red;'>{seqm.a[a0:a1]}</span>")
188
+ output.append(f"<span style='background-color:lime;'>{seqm.b[b0:b1]}</span>")
189
+ else:
190
+ if opcode == 'delete' or opcode == 'replace':
191
+ continue
192
+ raise RuntimeError("unexpected opcode")
193
+ return ''.join(output)
194
+
195
+ # define a function to paraphrase text using Cohere API
196
+ def paraphrase(text):
197
+ # create a cohere client with your API key
198
+ client = cohere.Client(api_key=COHERE_API_KEY)
199
+
200
+ # set the prompt for paraphrasing
201
+ prompt = f"Rephrase this sentence in a different way: {text}"
202
+
203
+ # generate a response using the multilingual-22-12 model
204
+ response = client.generate(
205
+ model="command-nightly",
206
+ prompt=prompt,
207
+ max_tokens=1000,
208
+
209
+ )
210
+ # get the generated text
211
+ rephrased_text = response[0].text
212
+ print(rephrased_text)
213
+
214
+ # compare the original and rephrased texts using difflib
215
+ sm = difflib.SequenceMatcher(None, text, rephrased_text)
216
+ html = show_diff_html(sm)
217
+
218
+ return html
219
 
220
  if __name__ == "__main__":
221
  with open('sample_text.txt', 'r') as file:
src/document_utils_v2.py DELETED
@@ -1,151 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import pandas as pd
5
- from typing import List
6
-
7
- import cohere
8
- from langchain.embeddings.cohere import CohereEmbeddings
9
- from langchain.llms import Cohere
10
- from langchain.prompts import PromptTemplate
11
- from langchain.vectorstores import Qdrant
12
- from langchain.chains.question_answering import load_qa_chain
13
-
14
- sys.path.append(os.path.abspath('..'))
15
-
16
- from src.constants import SUMMARIZATION_MODEL, EXAMPLES_FILE_PATH
17
-
18
-
19
-
20
- QDRANT_HOST = os.environ.get("QDRANT_HOST")
21
- QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
22
- COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
23
-
24
-
25
- def replace_text(text):
26
- if text.startswith("The answer is "):
27
- text = text.replace("The answer is ", "", 1)
28
- return text
29
-
30
-
31
- def summarize(
32
- document: str,
33
- summary_length: str,
34
- summary_format: str,
35
- extractiveness: str = "high",
36
- temperature: float = 0.6,
37
- ) -> str:
38
- """
39
- Generates a summary for the input document using Cohere's summarize API.
40
- Args:
41
- document (`str`):
42
- The document given by the user for which summary must be generated.
43
- summary_length (`str`):
44
- A value such as 'short', 'medium', 'long' indicating the length of the summary.
45
- summary_format (`str`):
46
- This indicates whether the generated summary should be in 'paragraph' format or 'bullets'.
47
- extractiveness (`str`, *optional*, defaults to 'high'):
48
- A value such as 'low', 'medium', 'high' indicating how close the generated summary should be in meaning to the original text.
49
- temperature (`str`):
50
- This controls the randomness of the output. Lower values tend to generate more “predictable” output, while higher values tend to generate more “creative” output.
51
- Returns:
52
- generated_summary (`str`):
53
- The generated summary from the summarization model.
54
- """
55
-
56
- summary_response = cohere.Client(COHERE_API_KEY).summarize(
57
- text=document,
58
- length=summary_length,
59
- format=summary_format,
60
- model=SUMMARIZATION_MODEL,
61
- extractiveness=extractiveness,
62
- temperature=temperature,
63
- )
64
- generated_summary = summary_response.summary
65
- return generated_summary
66
-
67
-
68
- def question_answer(input_document: str, history: List) -> str:
69
- """
70
- Generates an appropriate answer for the question asked by the user based on the input document.
71
- Args:
72
- input_document (`str`):
73
- The document given by the user for which summary must be generated.
74
- history (`List[List[str,str]]`):
75
- A list made up of pairs of input question asked by the user & corresponding generated answers. It is used to keep track of the history of the chat between the user and the model.
76
- Returns:
77
- answer (`str`):
78
- The generated answer corresponding to the input question and document received from the user.
79
- """
80
- context = input_document
81
- # The last element of the `history` list contains the most recent question asked by the user whose answer needs to be generated.
82
- question = history[-1][0]
83
- word_list = context.split()
84
- # texts = [context[k : k + 256] for k in range(0, len(context.split()), 256)]
85
- texts = [" ".join(word_list[k : k + 256]) for k in range(0, len(word_list), 256)]
86
-
87
- # print(texts)
88
-
89
- embeddings = CohereEmbeddings(
90
- model="multilingual-22-12", cohere_api_key=COHERE_API_KEY
91
- )
92
- context_index = Qdrant.from_texts(
93
- texts, embeddings, url=QDRANT_HOST, api_key=QDRANT_API_KEY
94
- )
95
-
96
- prompt_template = """Text: {context}
97
- Question: {question}
98
- Answer the question based on the text provided. If the text doesn't contain the answer, reply that the answer is not available."""
99
-
100
- PROMPT = PromptTemplate(
101
- template=prompt_template, input_variables=["context", "question"]
102
- )
103
-
104
- # Generate the answer given the context
105
- chain = load_qa_chain(
106
- Cohere(
107
- model="command-xlarge-nightly", temperature=0, cohere_api_key=COHERE_API_KEY
108
- ),
109
- chain_type="stuff",
110
- prompt=PROMPT,
111
- )
112
- relevant_context = context_index.similarity_search(question)
113
- answer = chain.run(input_documents=relevant_context, question=question)
114
- answer = answer.replace("\n", "").replace("Answer:", "")
115
- answer = replace_text(answer)
116
- return answer
117
-
118
- def generate_questions(input_document: str) -> str:
119
- generated_response = cohere.Client(COHERE_API_KEY).generate(
120
- prompt = f"Give me 5 different questions to test understanding of the following text provided. Here's the provided text: {input_document}. Now what is Questions 1 to 5 ?:",
121
- max_tokens = 200,
122
- temperature = 0.55
123
- )
124
- # prompt = f"Generate 5 different quiz questions to test the understanding of the following text. Here's the provided text: {input_document}. Whats Questions 1 to 5 of the quiz ?:"
125
- # print(prompt)
126
- return generated_response.generations[0].text
127
-
128
-
129
- def load_science():
130
- examples_df = pd.read_csv(EXAMPLES_FILE_PATH)
131
- science_doc = examples_df["doc"].iloc[0]
132
- sample_question = examples_df["question"].iloc[0]
133
- return science_doc, sample_question
134
-
135
-
136
- def load_history():
137
- examples_df = pd.read_csv(EXAMPLES_FILE_PATH)
138
- history_doc = examples_df["doc"].iloc[1]
139
- sample_question = examples_df["question"].iloc[1]
140
- return history_doc, sample_question
141
-
142
-
143
- if __name__ == "__main__":
144
- with open('sample_text.txt', 'r') as file:
145
- text = file.read()
146
- # summary = summarize(text, summary_length="short", summary_format="bullets")
147
- # print(summary)
148
- # answer = question_answer(text, [["what is photosynthesis", None]])
149
- # print(answer)
150
- question = question_answer(text, ["Whats photosynthesis"])
151
- print(question)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/wiki_search.py CHANGED
@@ -5,7 +5,6 @@ import pinecone
5
  from easygoogletranslate import EasyGoogleTranslate
6
 
7
 
8
- # load environment variables
9
  PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
10
  PINECONE_ENV = os.environ.get("PINECONE_ENV")
11
  COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
@@ -87,7 +86,9 @@ def cross_lingual_document_search(
87
  )
88
 
89
  results = [result['title']+"\n"+result['text'] for result in metadata]
 
90
 
 
91
  url_list = [result['url'] + "\n\n" for result in metadata]
92
 
93
  return results + url_list
@@ -113,12 +114,14 @@ def document_source(
113
 
114
  return results
115
 
116
-
117
  def translate_text(doc):
118
  doc = " ".join(doc.split()[:4800])
119
  result = translator.translate(doc, target_language='en')
120
  return result
121
 
 
 
 
122
  if __name__ == "__main__":
123
  # query_embedding, user_query = embed_user_query("Who is the president of Nigeria")
124
  # result = search_wiki_for_query(query_embedding,user_query=user_query)
 
5
  from easygoogletranslate import EasyGoogleTranslate
6
 
7
 
 
8
  PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
9
  PINECONE_ENV = os.environ.get("PINECONE_ENV")
10
  COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
 
86
  )
87
 
88
  results = [result['title']+"\n"+result['text'] for result in metadata]
89
+ url_list = [result['url'] + "\n\n" for result in metadata]
90
 
91
+ return results + url_list
92
  url_list = [result['url'] + "\n\n" for result in metadata]
93
 
94
  return results + url_list
 
114
 
115
  return results
116
 
 
117
  def translate_text(doc):
118
  doc = " ".join(doc.split()[:4800])
119
  result = translator.translate(doc, target_language='en')
120
  return result
121
 
122
+ def translate_search_result():
123
+ pass
124
+
125
  if __name__ == "__main__":
126
  # query_embedding, user_query = embed_user_query("Who is the president of Nigeria")
127
  # result = search_wiki_for_query(query_embedding,user_query=user_query)
src/wiki_search_v2.py DELETED
@@ -1,162 +0,0 @@
1
- import os
2
- import cohere
3
- from typing import List
4
-
5
- from qdrant_client import QdrantClient
6
- from qdrant_client import models
7
-
8
-
9
- # load environment variables
10
- QDRANT_HOST = os.environ.get("QDRANT_HOST")
11
- QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
12
- COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
13
-
14
- MODEL_NAME = "multilingual-22-12"
15
- COLLECTION = "wiki-embed"
16
-
17
- # create qdrant and cohere client
18
- cohere_client = cohere.Client(COHERE_API_KEY)
19
-
20
- qdrant_client = QdrantClient(
21
- host=QDRANT_HOST,
22
- api_key=QDRANT_API_KEY,
23
- port = 443,
24
- )
25
-
26
- def embed_user_query(user_query):
27
-
28
- embeddings = cohere_client.embed(
29
- texts=[user_query],
30
- model=MODEL_NAME,
31
- )
32
- query_embedding = embeddings.embeddings[0]
33
- return query_embedding, user_query
34
-
35
-
36
- def search_wiki_for_query(
37
- query_embedding,
38
- num_results = 3,
39
- user_query= "",
40
- languages = [],
41
- match_text = None,
42
- ):
43
- filters = []
44
-
45
- language_mapping = {
46
- "English": "en",
47
- "Yoruba": "yo",
48
- "Igbo": "ig",
49
- "Hause": "ha",
50
- }
51
-
52
- # prepare filters to narrow down search results
53
- # if the `match_text` list is not empty then create filter to find exact matching text in the documents
54
- if match_text:
55
- filters.append(
56
- models.FieldCondition(
57
- key="text",
58
- match=models.MatchText(text=user_query),
59
- )
60
- )
61
-
62
- # filter documents based on language before performing search:
63
- if languages:
64
- for lang in languages:
65
- filters.append(
66
- models.FieldCondition(
67
- key="lang",
68
- match=models.MatchValue(
69
- value=language_mapping[lang],
70
- ),
71
- )
72
- )
73
-
74
- # perform search and get results
75
- results = qdrant_client.search(
76
- collection_name=COLLECTION,
77
- query_filter=models.Filter(should=filters),
78
- search_params=models.SearchParams(hnsw_ef=128, exact=False),
79
- query_vector=query_embedding,
80
- limit=num_results,
81
- )
82
- return results
83
-
84
-
85
- def cross_lingual_document_search(
86
- user_input: str, num_results: int, languages, text_match
87
- ) -> List:
88
- """
89
- Wrapper function for performing search on the collection of documents for the given user query.
90
- Prepares query embedding, retrieves search results, checks if expected number of search results are being returned.
91
- Args:
92
- user_input (`str`):
93
- The user input based on which search will be performed.
94
- num_results (`str`):
95
- The number of expected search results.
96
- languages (`str`):
97
- The list of languages based on which search results must be filtered.
98
- text_match (`str`):
99
- A field based on which it is decided whether to perform full-text-match while performing search.
100
- Returns:
101
- final_results (`List[str]`):
102
- A list containing the final search results corresponding to the given user input.
103
- """
104
- # create an embedding for the input query
105
- query_embedding, _ = embed_user_query(user_input)
106
-
107
- # retrieve search results
108
- result = search_wiki_for_query(
109
- query_embedding,
110
- num_results,
111
- user_input,
112
- languages,
113
- text_match,
114
- )
115
- final_results = [result[i].payload["text"] for i in range(len(result))]
116
-
117
- # check if number of search results obtained (i.e. `final_results`) is matching with number of expected search results i.e. `num_results`
118
- if num_results > len(final_results):
119
- remaining_inputs = num_results - len(final_results)
120
- for input in range(remaining_inputs):
121
- final_results.append("")
122
-
123
- return final_results
124
-
125
- def document_source(
126
- user_input: str, num_results: int, languages, text_match
127
- ) -> List:
128
- query_embedding, _ = embed_user_query(user_input)
129
-
130
- # retrieve search results
131
- result = search_wiki_for_query(
132
- query_embedding,
133
- num_results,
134
- user_input,
135
- languages,
136
- text_match,
137
- )
138
- sources = [result[i].payload["url"] for i in range(len(result))]
139
-
140
- # check if number of search results obtained (i.e. `final_results`) is matching with number of expected search results i.e. `num_results`
141
- if num_results > len(sources):
142
- remaining_inputs = num_results - len(sources)
143
- for input in range(remaining_inputs):
144
- sources.append("")
145
-
146
- return sources
147
-
148
-
149
- def translate_search_result():
150
- pass
151
-
152
- if __name__ == "__main__":
153
- # query_embedding, user_query = embed_user_query("Who is the president of Nigeria")
154
- # result = search_wiki_for_query(query_embedding,user_query=user_query)
155
-
156
- # for item in result:
157
- # print(item.payload["url"])
158
- result = cross_lingual_document_search("Who is the president of Nigeria",
159
- num_results=3,
160
- languages=["Yoruba"],
161
- text_match=False)
162
- print(result, len(result))