Spaces:
Runtime error
Runtime error
Commit
•
31dada8
1
Parent(s):
9d5744a
Add paraphrase feature
Browse files- .gitignore +145 -1
- __pycache__/app.cpython-311.pyc +0 -0
- app.py +69 -35
- src/document_utils.py +81 -8
- src/document_utils_v2.py +0 -151
- src/wiki_search.py +5 -2
- src/wiki_search_v2.py +0 -162
.gitignore
CHANGED
@@ -1,2 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
.DS_Store
|
2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
pip-wheel-metadata/
|
24 |
+
share/python-wheels/
|
25 |
+
*.egg-info/
|
26 |
+
.installed.cfg
|
27 |
+
*.egg
|
28 |
+
MANIFEST
|
29 |
+
|
30 |
+
# PyInstaller
|
31 |
+
# Usually these files are written by a python script from a template
|
32 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
33 |
+
*.manifest
|
34 |
+
*.spec
|
35 |
+
|
36 |
+
# Installer logs
|
37 |
+
pip-log.txt
|
38 |
+
pip-delete-this-directory.txt
|
39 |
+
|
40 |
+
# Unit test / coverage reports
|
41 |
+
htmlcov/
|
42 |
+
.tox/
|
43 |
+
.nox/
|
44 |
+
.coverage
|
45 |
+
.coverage.*
|
46 |
+
.cache
|
47 |
+
nosetests.xml
|
48 |
+
coverage.xml
|
49 |
+
*.cover
|
50 |
+
*.py,cover
|
51 |
+
.hypothesis/
|
52 |
+
.pytest_cache/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
target/
|
76 |
+
|
77 |
+
# Jupyter Notebook
|
78 |
+
.ipynb_checkpoints
|
79 |
+
|
80 |
+
# IPython
|
81 |
+
profile_default/
|
82 |
+
ipython_config.py
|
83 |
+
|
84 |
+
# pyenv
|
85 |
+
.python-version
|
86 |
+
|
87 |
+
# pipenv
|
88 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
89 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
90 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
91 |
+
# install all needed dependencies.
|
92 |
+
#Pipfile.lock
|
93 |
+
|
94 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
95 |
+
__pypackages__/
|
96 |
+
|
97 |
+
# Celery stuff
|
98 |
+
celerybeat-schedule
|
99 |
+
celerybeat.pid
|
100 |
+
|
101 |
+
# SageMath parsed files
|
102 |
+
*.sage.py
|
103 |
+
|
104 |
+
# Environments
|
105 |
+
.env
|
106 |
+
.venv
|
107 |
+
env/
|
108 |
+
venv/
|
109 |
+
ENV/
|
110 |
+
env.bak/
|
111 |
+
venv.bak/
|
112 |
+
|
113 |
+
# Spyder project settings
|
114 |
+
.spyderproject
|
115 |
+
.spyproject
|
116 |
+
|
117 |
+
# Rope project settings
|
118 |
+
.ropeproject
|
119 |
+
|
120 |
+
# mkdocs documentation
|
121 |
+
/site
|
122 |
+
|
123 |
+
# mypy
|
124 |
+
.mypy_cache/
|
125 |
+
.dmypy.json
|
126 |
+
dmypy.json
|
127 |
+
|
128 |
+
# Pyre type checker
|
129 |
+
.pyre/
|
130 |
+
|
131 |
+
# JetBrains IDE project settings
|
132 |
+
.idea/
|
133 |
+
|
134 |
+
# OS generated files
|
135 |
.DS_Store
|
136 |
+
.DS_Store?
|
137 |
+
._*
|
138 |
+
.Spotlight-V100
|
139 |
+
.Trashes
|
140 |
+
Icon?
|
141 |
+
ehthumbs.db
|
142 |
+
Thumbs.db
|
143 |
+
|
144 |
+
# secrets
|
145 |
+
.env
|
146 |
+
.vscode/
|
__pycache__/app.cpython-311.pyc
DELETED
Binary file (17.1 kB)
|
|
app.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1 |
-
# coding: utf-8
|
2 |
-
|
3 |
import gradio as gr
|
4 |
from src.document_utils import (
|
5 |
summarize,
|
@@ -7,6 +5,7 @@ from src.document_utils import (
|
|
7 |
generate_questions,
|
8 |
load_history,
|
9 |
load_science,
|
|
|
10 |
)
|
11 |
from src.wiki_search import cross_lingual_document_search, translate_text
|
12 |
from src.theme import CustomTheme
|
@@ -27,13 +26,6 @@ def study_doc_qa_bot(input_document, history):
|
|
27 |
bot_message = question_answer(input_document, history)
|
28 |
history[-1][1] = bot_message
|
29 |
return history
|
30 |
-
|
31 |
-
# def translate_text(doc):
|
32 |
-
# translator = EasyGoogleTranslate()
|
33 |
-
|
34 |
-
# doc = " ".join(doc.split()[:4800])
|
35 |
-
# result = translator.translate(doc, target_language='en')
|
36 |
-
# return result
|
37 |
|
38 |
|
39 |
custom_theme = CustomTheme()
|
@@ -50,7 +42,7 @@ with gr.Blocks(theme=custom_theme) as demo:
|
|
50 |
|
51 |
with gr.TabItem("Document Search"):
|
52 |
gr.HTML(
|
53 |
-
"""<p style="text-align:center;font-size:24px;"><b>Search across a
|
54 |
)
|
55 |
gr.HTML(
|
56 |
"""<p style="text-align:center; font-style:italic; font-size:16px;">Get started with a pre-indexed set of study materials spaning various subjects (History, Literature, Philosophy, Government etc) in 4 different languages.</p>"""
|
@@ -98,12 +90,7 @@ with gr.Blocks(theme=custom_theme) as demo:
|
|
98 |
)
|
99 |
|
100 |
with gr.Column():
|
101 |
-
with gr.Accordion("Click to View Source", open=False):
|
102 |
-
|
103 |
-
source_res_1 = gr.Textbox(
|
104 |
-
label=f"Source Url",
|
105 |
-
|
106 |
-
)
|
107 |
translate_btn_1 = gr.Button(
|
108 |
label="Translate Text",
|
109 |
value="Translate Text",
|
@@ -113,16 +100,17 @@ with gr.Blocks(theme=custom_theme) as demo:
|
|
113 |
label=f"Translation in English",
|
114 |
)
|
115 |
|
|
|
|
|
|
|
|
|
116 |
with gr.Row():
|
117 |
with gr.Column():
|
118 |
query_match_out_2 = gr.Textbox(label=f"Search Result 2")
|
119 |
|
120 |
with gr.Column():
|
121 |
-
with gr.Accordion("Click to View Source", open=False):
|
122 |
-
|
123 |
-
label=f"Source Url"
|
124 |
-
)
|
125 |
-
|
126 |
translate_btn_2 = gr.Button(
|
127 |
label="Translate Text",
|
128 |
value="Translate Text",
|
@@ -133,15 +121,18 @@ with gr.Blocks(theme=custom_theme) as demo:
|
|
133 |
|
134 |
)
|
135 |
|
|
|
|
|
|
|
|
|
|
|
136 |
with gr.Row():
|
137 |
with gr.Column():
|
138 |
query_match_out_3 = gr.Textbox(label=f"Search Result 3")
|
139 |
|
140 |
with gr.Column():
|
141 |
-
with gr.Accordion("Click to View Source", open=False):
|
142 |
-
|
143 |
-
label=f"Source Url"
|
144 |
-
)
|
145 |
translate_btn_3 = gr.Button(
|
146 |
label="Translate Text",
|
147 |
value="Translate Text",
|
@@ -150,6 +141,9 @@ with gr.Blocks(theme=custom_theme) as demo:
|
|
150 |
translate_res_3= gr.Textbox(
|
151 |
label=f"Translation in English",
|
152 |
)
|
|
|
|
|
|
|
153 |
|
154 |
with gr.TabItem("Q&A"):
|
155 |
gr.HTML(
|
@@ -179,8 +173,6 @@ with gr.Blocks(theme=custom_theme) as demo:
|
|
179 |
)
|
180 |
clear = gr.Button("Clear", variant="primary")
|
181 |
|
182 |
-
|
183 |
-
|
184 |
with gr.TabItem("Summarize"):
|
185 |
gr.HTML(
|
186 |
"""<p style="text-align:center; font-size:24px;"><b> Get the most out of your study materials!</p>"""
|
@@ -246,7 +238,50 @@ with gr.Blocks(theme=custom_theme) as demo:
|
|
246 |
with gr.Row():
|
247 |
generate_output = gr.Text(label="Generated questions", lines=5)
|
248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
# fetch answer for submitted question corresponding to input document
|
251 |
input_question.submit(
|
252 |
get_user_input,
|
@@ -303,17 +338,16 @@ with gr.Blocks(theme=custom_theme) as demo:
|
|
303 |
queue=False,
|
304 |
)
|
305 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
306 |
# clear the chatbot Q&A history when this button is clicked by the user
|
307 |
clear.click(lambda: None, None, chatbot, queue=False)
|
308 |
|
309 |
-
# # run search as user is typing the query
|
310 |
-
# user_query.change(
|
311 |
-
# cross_lingual_document_search,
|
312 |
-
# [user_query, num_search_results, lang_choices, text_match],
|
313 |
-
# [query_match_out_1, query_match_out_2, query_match_out_3],
|
314 |
-
# queue=False,
|
315 |
-
# )
|
316 |
-
|
317 |
# run search if user submits query
|
318 |
user_query.submit(
|
319 |
cross_lingual_document_search,
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
from src.document_utils import (
|
3 |
summarize,
|
|
|
5 |
generate_questions,
|
6 |
load_history,
|
7 |
load_science,
|
8 |
+
paraphrase
|
9 |
)
|
10 |
from src.wiki_search import cross_lingual_document_search, translate_text
|
11 |
from src.theme import CustomTheme
|
|
|
26 |
bot_message = question_answer(input_document, history)
|
27 |
history[-1][1] = bot_message
|
28 |
return history
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
|
31 |
custom_theme = CustomTheme()
|
|
|
42 |
|
43 |
with gr.TabItem("Document Search"):
|
44 |
gr.HTML(
|
45 |
+
"""<p style="text-align:center;font-size:24px;"><b>Search across a library of study materials in your own native language or even a mix of languages.</p>"""
|
46 |
)
|
47 |
gr.HTML(
|
48 |
"""<p style="text-align:center; font-style:italic; font-size:16px;">Get started with a pre-indexed set of study materials spaning various subjects (History, Literature, Philosophy, Government etc) in 4 different languages.</p>"""
|
|
|
90 |
)
|
91 |
|
92 |
with gr.Column():
|
93 |
+
with gr.Accordion("Click to View Translation/Source", open=False):
|
|
|
|
|
|
|
|
|
|
|
94 |
translate_btn_1 = gr.Button(
|
95 |
label="Translate Text",
|
96 |
value="Translate Text",
|
|
|
100 |
label=f"Translation in English",
|
101 |
)
|
102 |
|
103 |
+
source_res_1 = gr.Textbox(
|
104 |
+
label=f"Source Url",
|
105 |
+
)
|
106 |
+
|
107 |
with gr.Row():
|
108 |
with gr.Column():
|
109 |
query_match_out_2 = gr.Textbox(label=f"Search Result 2")
|
110 |
|
111 |
with gr.Column():
|
112 |
+
with gr.Accordion("Click to View Translation/Source", open=False):
|
113 |
+
|
|
|
|
|
|
|
114 |
translate_btn_2 = gr.Button(
|
115 |
label="Translate Text",
|
116 |
value="Translate Text",
|
|
|
121 |
|
122 |
)
|
123 |
|
124 |
+
source_res_2 = gr.Textbox(
|
125 |
+
label=f"Source Url"
|
126 |
+
)
|
127 |
+
|
128 |
+
|
129 |
with gr.Row():
|
130 |
with gr.Column():
|
131 |
query_match_out_3 = gr.Textbox(label=f"Search Result 3")
|
132 |
|
133 |
with gr.Column():
|
134 |
+
with gr.Accordion("Click to View Translation/Source", open=False):
|
135 |
+
|
|
|
|
|
136 |
translate_btn_3 = gr.Button(
|
137 |
label="Translate Text",
|
138 |
value="Translate Text",
|
|
|
141 |
translate_res_3= gr.Textbox(
|
142 |
label=f"Translation in English",
|
143 |
)
|
144 |
+
source_res_3 = gr.Textbox(
|
145 |
+
label=f"Source Url"
|
146 |
+
)
|
147 |
|
148 |
with gr.TabItem("Q&A"):
|
149 |
gr.HTML(
|
|
|
173 |
)
|
174 |
clear = gr.Button("Clear", variant="primary")
|
175 |
|
|
|
|
|
176 |
with gr.TabItem("Summarize"):
|
177 |
gr.HTML(
|
178 |
"""<p style="text-align:center; font-size:24px;"><b> Get the most out of your study materials!</p>"""
|
|
|
238 |
with gr.Row():
|
239 |
generate_output = gr.Text(label="Generated questions", lines=5)
|
240 |
|
241 |
+
with gr.TabItem("Paraphrase"):
|
242 |
+
gr.HTML(
|
243 |
+
"""<p style="text-align:center;"><b>Paraphraser. Add your document below and generate a rephrase for it.</p>"""
|
244 |
+
)
|
245 |
+
|
246 |
+
with gr.Row():
|
247 |
+
with gr.Column():
|
248 |
+
paraphrase_input = gr.Text(label="Document", lines=10)
|
249 |
+
generate_paraphrase = gr.Button("Paraphrase", variant="primary")
|
250 |
+
|
251 |
+
with gr.Column():
|
252 |
+
paraphrase_output = gr.HTML(label="Paraphrase", lines=10)
|
253 |
+
invisible_comp = gr.Text(label="Dummy Component", visible=False)
|
254 |
|
255 |
+
with gr.Row():
|
256 |
+
with gr.Accordion("Advanced Settings:", open=False):
|
257 |
+
paraphrase_length = gr.Radio(
|
258 |
+
["short", "medium", "long"],
|
259 |
+
label="Paraphrase Length",
|
260 |
+
value="long",
|
261 |
+
)
|
262 |
+
paraphrase_format = gr.Radio(
|
263 |
+
["paragraph", "bullets"],
|
264 |
+
label="Paraphrase Format",
|
265 |
+
value="bullets",
|
266 |
+
)
|
267 |
+
extractiveness = gr.Radio(
|
268 |
+
["low", "medium", "high"],
|
269 |
+
label="Extractiveness",
|
270 |
+
info="Controls how close to the original text the paraphrase is.",
|
271 |
+
visible=False,
|
272 |
+
value="high",
|
273 |
+
)
|
274 |
+
temperature = gr.Slider(
|
275 |
+
minimum=0,
|
276 |
+
maximum=5.0,
|
277 |
+
value=0.64,
|
278 |
+
step=0.1,
|
279 |
+
interactive=True,
|
280 |
+
visible=False,
|
281 |
+
label="Temperature",
|
282 |
+
info="Controls the randomness of the output. Lower values tend to generate more “predictable” output, while higher values tend to generate more “creative” output.",
|
283 |
+
)
|
284 |
+
|
285 |
# fetch answer for submitted question corresponding to input document
|
286 |
input_question.submit(
|
287 |
get_user_input,
|
|
|
338 |
queue=False,
|
339 |
)
|
340 |
|
341 |
+
generate_paraphrase.click(
|
342 |
+
paraphrase,
|
343 |
+
[paraphrase_input],
|
344 |
+
[paraphrase_output],
|
345 |
+
queue=False,
|
346 |
+
)
|
347 |
+
|
348 |
# clear the chatbot Q&A history when this button is clicked by the user
|
349 |
clear.click(lambda: None, None, chatbot, queue=False)
|
350 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
351 |
# run search if user submits query
|
352 |
user_query.submit(
|
353 |
cross_lingual_document_search,
|
src/document_utils.py
CHANGED
@@ -4,6 +4,7 @@ import sys
|
|
4 |
import pandas as pd
|
5 |
from typing import List
|
6 |
import pinecone
|
|
|
7 |
|
8 |
import cohere
|
9 |
from langchain.embeddings.cohere import CohereEmbeddings
|
@@ -17,6 +18,7 @@ sys.path.append(os.path.abspath('..'))
|
|
17 |
from src.constants import SUMMARIZATION_MODEL, EXAMPLES_FILE_PATH
|
18 |
|
19 |
|
|
|
20 |
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
|
21 |
PINECONE_ENV = os.environ.get("PINECONE_ENV")
|
22 |
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
|
@@ -120,14 +122,40 @@ def question_answer(input_document: str, history: List) -> str:
|
|
120 |
return answer
|
121 |
|
122 |
def generate_questions(input_document: str) -> str:
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
|
133 |
def load_science():
|
@@ -143,6 +171,51 @@ def load_history():
|
|
143 |
sample_question = examples_df["question"].iloc[1]
|
144 |
return history_doc, sample_question
|
145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
|
147 |
if __name__ == "__main__":
|
148 |
with open('sample_text.txt', 'r') as file:
|
|
|
4 |
import pandas as pd
|
5 |
from typing import List
|
6 |
import pinecone
|
7 |
+
import difflib
|
8 |
|
9 |
import cohere
|
10 |
from langchain.embeddings.cohere import CohereEmbeddings
|
|
|
18 |
from src.constants import SUMMARIZATION_MODEL, EXAMPLES_FILE_PATH
|
19 |
|
20 |
|
21 |
+
|
22 |
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
|
23 |
PINECONE_ENV = os.environ.get("PINECONE_ENV")
|
24 |
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
|
|
|
122 |
return answer
|
123 |
|
124 |
def generate_questions(input_document: str) -> str:
|
125 |
+
co = cohere.Client(COHERE_API_KEY)
|
126 |
+
prompt = f"""Write five different questions to test the understanding of the following text. The questions should be short answer, with one or two words each, and vary in difficulty from easy to hard. Provide the correct answer for each question after the question.
|
127 |
+
Now write your own questions for this text:
|
128 |
+
|
129 |
+
Text: {input_document}
|
130 |
+
|
131 |
+
Question 1: (question_1)
|
132 |
+
Answer: (answer_1)
|
133 |
+
|
134 |
+
Question 2: (question_2)
|
135 |
+
Answer: (answer_2)
|
136 |
+
|
137 |
+
Question 3: (question_3)
|
138 |
+
Answer: (answer_3)
|
139 |
+
|
140 |
+
Question 4: (question_4)
|
141 |
+
Answer: (answer_4)
|
142 |
+
|
143 |
+
Question 5: (question_5)
|
144 |
+
Answer: (answer_5)"""
|
145 |
+
|
146 |
+
|
147 |
+
response = co.generate(model='command', prompt=prompt, temperature=2, max_tokens=1000, )
|
148 |
+
|
149 |
+
answer = response.generations[0].text.strip()
|
150 |
+
print(answer)
|
151 |
+
questions = answer.split('\n\n')
|
152 |
+
print(questions)
|
153 |
+
result = {}
|
154 |
+
for question in questions:
|
155 |
+
q, a = question.split('\n')
|
156 |
+
result[q] = a.split(': ')[1]
|
157 |
+
|
158 |
+
return answer
|
159 |
|
160 |
|
161 |
def load_science():
|
|
|
171 |
sample_question = examples_df["question"].iloc[1]
|
172 |
return history_doc, sample_question
|
173 |
|
174 |
+
def show_diff_html(seqm):
|
175 |
+
"""Unify operations between two compared strings
|
176 |
+
seqm is a difflib.SequenceMatcher instance whose a & b are strings
|
177 |
+
"""
|
178 |
+
output = []
|
179 |
+
for opcode, a0, a1, b0, b1 in seqm.get_opcodes():
|
180 |
+
if opcode == 'equal':
|
181 |
+
output.append(seqm.b[b0:b1])
|
182 |
+
elif opcode == 'insert':
|
183 |
+
output.append(f"<span style='background-color:lime;'>{seqm.b[b0:b1]}</span>")
|
184 |
+
# elif opcode == 'delete':
|
185 |
+
# output.append(f"<span style='background-color:red;'>{seqm.a[a0:a1]}</span>")
|
186 |
+
elif opcode == 'replace':
|
187 |
+
# output.append(f"<span style='background-color:red;'>{seqm.a[a0:a1]}</span>")
|
188 |
+
output.append(f"<span style='background-color:lime;'>{seqm.b[b0:b1]}</span>")
|
189 |
+
else:
|
190 |
+
if opcode == 'delete' or opcode == 'replace':
|
191 |
+
continue
|
192 |
+
raise RuntimeError("unexpected opcode")
|
193 |
+
return ''.join(output)
|
194 |
+
|
195 |
+
# define a function to paraphrase text using Cohere API
|
196 |
+
def paraphrase(text):
|
197 |
+
# create a cohere client with your API key
|
198 |
+
client = cohere.Client(api_key=COHERE_API_KEY)
|
199 |
+
|
200 |
+
# set the prompt for paraphrasing
|
201 |
+
prompt = f"Rephrase this sentence in a different way: {text}"
|
202 |
+
|
203 |
+
# generate a response using the multilingual-22-12 model
|
204 |
+
response = client.generate(
|
205 |
+
model="command-nightly",
|
206 |
+
prompt=prompt,
|
207 |
+
max_tokens=1000,
|
208 |
+
|
209 |
+
)
|
210 |
+
# get the generated text
|
211 |
+
rephrased_text = response[0].text
|
212 |
+
print(rephrased_text)
|
213 |
+
|
214 |
+
# compare the original and rephrased texts using difflib
|
215 |
+
sm = difflib.SequenceMatcher(None, text, rephrased_text)
|
216 |
+
html = show_diff_html(sm)
|
217 |
+
|
218 |
+
return html
|
219 |
|
220 |
if __name__ == "__main__":
|
221 |
with open('sample_text.txt', 'r') as file:
|
src/document_utils_v2.py
DELETED
@@ -1,151 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import sys
|
3 |
-
|
4 |
-
import pandas as pd
|
5 |
-
from typing import List
|
6 |
-
|
7 |
-
import cohere
|
8 |
-
from langchain.embeddings.cohere import CohereEmbeddings
|
9 |
-
from langchain.llms import Cohere
|
10 |
-
from langchain.prompts import PromptTemplate
|
11 |
-
from langchain.vectorstores import Qdrant
|
12 |
-
from langchain.chains.question_answering import load_qa_chain
|
13 |
-
|
14 |
-
sys.path.append(os.path.abspath('..'))
|
15 |
-
|
16 |
-
from src.constants import SUMMARIZATION_MODEL, EXAMPLES_FILE_PATH
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
QDRANT_HOST = os.environ.get("QDRANT_HOST")
|
21 |
-
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
|
22 |
-
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
|
23 |
-
|
24 |
-
|
25 |
-
def replace_text(text):
|
26 |
-
if text.startswith("The answer is "):
|
27 |
-
text = text.replace("The answer is ", "", 1)
|
28 |
-
return text
|
29 |
-
|
30 |
-
|
31 |
-
def summarize(
|
32 |
-
document: str,
|
33 |
-
summary_length: str,
|
34 |
-
summary_format: str,
|
35 |
-
extractiveness: str = "high",
|
36 |
-
temperature: float = 0.6,
|
37 |
-
) -> str:
|
38 |
-
"""
|
39 |
-
Generates a summary for the input document using Cohere's summarize API.
|
40 |
-
Args:
|
41 |
-
document (`str`):
|
42 |
-
The document given by the user for which summary must be generated.
|
43 |
-
summary_length (`str`):
|
44 |
-
A value such as 'short', 'medium', 'long' indicating the length of the summary.
|
45 |
-
summary_format (`str`):
|
46 |
-
This indicates whether the generated summary should be in 'paragraph' format or 'bullets'.
|
47 |
-
extractiveness (`str`, *optional*, defaults to 'high'):
|
48 |
-
A value such as 'low', 'medium', 'high' indicating how close the generated summary should be in meaning to the original text.
|
49 |
-
temperature (`str`):
|
50 |
-
This controls the randomness of the output. Lower values tend to generate more “predictable” output, while higher values tend to generate more “creative” output.
|
51 |
-
Returns:
|
52 |
-
generated_summary (`str`):
|
53 |
-
The generated summary from the summarization model.
|
54 |
-
"""
|
55 |
-
|
56 |
-
summary_response = cohere.Client(COHERE_API_KEY).summarize(
|
57 |
-
text=document,
|
58 |
-
length=summary_length,
|
59 |
-
format=summary_format,
|
60 |
-
model=SUMMARIZATION_MODEL,
|
61 |
-
extractiveness=extractiveness,
|
62 |
-
temperature=temperature,
|
63 |
-
)
|
64 |
-
generated_summary = summary_response.summary
|
65 |
-
return generated_summary
|
66 |
-
|
67 |
-
|
68 |
-
def question_answer(input_document: str, history: List) -> str:
|
69 |
-
"""
|
70 |
-
Generates an appropriate answer for the question asked by the user based on the input document.
|
71 |
-
Args:
|
72 |
-
input_document (`str`):
|
73 |
-
The document given by the user for which summary must be generated.
|
74 |
-
history (`List[List[str,str]]`):
|
75 |
-
A list made up of pairs of input question asked by the user & corresponding generated answers. It is used to keep track of the history of the chat between the user and the model.
|
76 |
-
Returns:
|
77 |
-
answer (`str`):
|
78 |
-
The generated answer corresponding to the input question and document received from the user.
|
79 |
-
"""
|
80 |
-
context = input_document
|
81 |
-
# The last element of the `history` list contains the most recent question asked by the user whose answer needs to be generated.
|
82 |
-
question = history[-1][0]
|
83 |
-
word_list = context.split()
|
84 |
-
# texts = [context[k : k + 256] for k in range(0, len(context.split()), 256)]
|
85 |
-
texts = [" ".join(word_list[k : k + 256]) for k in range(0, len(word_list), 256)]
|
86 |
-
|
87 |
-
# print(texts)
|
88 |
-
|
89 |
-
embeddings = CohereEmbeddings(
|
90 |
-
model="multilingual-22-12", cohere_api_key=COHERE_API_KEY
|
91 |
-
)
|
92 |
-
context_index = Qdrant.from_texts(
|
93 |
-
texts, embeddings, url=QDRANT_HOST, api_key=QDRANT_API_KEY
|
94 |
-
)
|
95 |
-
|
96 |
-
prompt_template = """Text: {context}
|
97 |
-
Question: {question}
|
98 |
-
Answer the question based on the text provided. If the text doesn't contain the answer, reply that the answer is not available."""
|
99 |
-
|
100 |
-
PROMPT = PromptTemplate(
|
101 |
-
template=prompt_template, input_variables=["context", "question"]
|
102 |
-
)
|
103 |
-
|
104 |
-
# Generate the answer given the context
|
105 |
-
chain = load_qa_chain(
|
106 |
-
Cohere(
|
107 |
-
model="command-xlarge-nightly", temperature=0, cohere_api_key=COHERE_API_KEY
|
108 |
-
),
|
109 |
-
chain_type="stuff",
|
110 |
-
prompt=PROMPT,
|
111 |
-
)
|
112 |
-
relevant_context = context_index.similarity_search(question)
|
113 |
-
answer = chain.run(input_documents=relevant_context, question=question)
|
114 |
-
answer = answer.replace("\n", "").replace("Answer:", "")
|
115 |
-
answer = replace_text(answer)
|
116 |
-
return answer
|
117 |
-
|
118 |
-
def generate_questions(input_document: str) -> str:
|
119 |
-
generated_response = cohere.Client(COHERE_API_KEY).generate(
|
120 |
-
prompt = f"Give me 5 different questions to test understanding of the following text provided. Here's the provided text: {input_document}. Now what is Questions 1 to 5 ?:",
|
121 |
-
max_tokens = 200,
|
122 |
-
temperature = 0.55
|
123 |
-
)
|
124 |
-
# prompt = f"Generate 5 different quiz questions to test the understanding of the following text. Here's the provided text: {input_document}. Whats Questions 1 to 5 of the quiz ?:"
|
125 |
-
# print(prompt)
|
126 |
-
return generated_response.generations[0].text
|
127 |
-
|
128 |
-
|
129 |
-
def load_science():
|
130 |
-
examples_df = pd.read_csv(EXAMPLES_FILE_PATH)
|
131 |
-
science_doc = examples_df["doc"].iloc[0]
|
132 |
-
sample_question = examples_df["question"].iloc[0]
|
133 |
-
return science_doc, sample_question
|
134 |
-
|
135 |
-
|
136 |
-
def load_history():
|
137 |
-
examples_df = pd.read_csv(EXAMPLES_FILE_PATH)
|
138 |
-
history_doc = examples_df["doc"].iloc[1]
|
139 |
-
sample_question = examples_df["question"].iloc[1]
|
140 |
-
return history_doc, sample_question
|
141 |
-
|
142 |
-
|
143 |
-
if __name__ == "__main__":
|
144 |
-
with open('sample_text.txt', 'r') as file:
|
145 |
-
text = file.read()
|
146 |
-
# summary = summarize(text, summary_length="short", summary_format="bullets")
|
147 |
-
# print(summary)
|
148 |
-
# answer = question_answer(text, [["what is photosynthesis", None]])
|
149 |
-
# print(answer)
|
150 |
-
question = question_answer(text, ["Whats photosynthesis"])
|
151 |
-
print(question)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/wiki_search.py
CHANGED
@@ -5,7 +5,6 @@ import pinecone
|
|
5 |
from easygoogletranslate import EasyGoogleTranslate
|
6 |
|
7 |
|
8 |
-
# load environment variables
|
9 |
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
|
10 |
PINECONE_ENV = os.environ.get("PINECONE_ENV")
|
11 |
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
|
@@ -87,7 +86,9 @@ def cross_lingual_document_search(
|
|
87 |
)
|
88 |
|
89 |
results = [result['title']+"\n"+result['text'] for result in metadata]
|
|
|
90 |
|
|
|
91 |
url_list = [result['url'] + "\n\n" for result in metadata]
|
92 |
|
93 |
return results + url_list
|
@@ -113,12 +114,14 @@ def document_source(
|
|
113 |
|
114 |
return results
|
115 |
|
116 |
-
|
117 |
def translate_text(doc):
|
118 |
doc = " ".join(doc.split()[:4800])
|
119 |
result = translator.translate(doc, target_language='en')
|
120 |
return result
|
121 |
|
|
|
|
|
|
|
122 |
if __name__ == "__main__":
|
123 |
# query_embedding, user_query = embed_user_query("Who is the president of Nigeria")
|
124 |
# result = search_wiki_for_query(query_embedding,user_query=user_query)
|
|
|
5 |
from easygoogletranslate import EasyGoogleTranslate
|
6 |
|
7 |
|
|
|
8 |
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
|
9 |
PINECONE_ENV = os.environ.get("PINECONE_ENV")
|
10 |
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
|
|
|
86 |
)
|
87 |
|
88 |
results = [result['title']+"\n"+result['text'] for result in metadata]
|
89 |
+
url_list = [result['url'] + "\n\n" for result in metadata]
|
90 |
|
91 |
+
return results + url_list
|
92 |
url_list = [result['url'] + "\n\n" for result in metadata]
|
93 |
|
94 |
return results + url_list
|
|
|
114 |
|
115 |
return results
|
116 |
|
|
|
117 |
def translate_text(doc):
|
118 |
doc = " ".join(doc.split()[:4800])
|
119 |
result = translator.translate(doc, target_language='en')
|
120 |
return result
|
121 |
|
122 |
+
def translate_search_result():
|
123 |
+
pass
|
124 |
+
|
125 |
if __name__ == "__main__":
|
126 |
# query_embedding, user_query = embed_user_query("Who is the president of Nigeria")
|
127 |
# result = search_wiki_for_query(query_embedding,user_query=user_query)
|
src/wiki_search_v2.py
DELETED
@@ -1,162 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import cohere
|
3 |
-
from typing import List
|
4 |
-
|
5 |
-
from qdrant_client import QdrantClient
|
6 |
-
from qdrant_client import models
|
7 |
-
|
8 |
-
|
9 |
-
# load environment variables
|
10 |
-
QDRANT_HOST = os.environ.get("QDRANT_HOST")
|
11 |
-
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
|
12 |
-
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
|
13 |
-
|
14 |
-
MODEL_NAME = "multilingual-22-12"
|
15 |
-
COLLECTION = "wiki-embed"
|
16 |
-
|
17 |
-
# create qdrant and cohere client
|
18 |
-
cohere_client = cohere.Client(COHERE_API_KEY)
|
19 |
-
|
20 |
-
qdrant_client = QdrantClient(
|
21 |
-
host=QDRANT_HOST,
|
22 |
-
api_key=QDRANT_API_KEY,
|
23 |
-
port = 443,
|
24 |
-
)
|
25 |
-
|
26 |
-
def embed_user_query(user_query):
|
27 |
-
|
28 |
-
embeddings = cohere_client.embed(
|
29 |
-
texts=[user_query],
|
30 |
-
model=MODEL_NAME,
|
31 |
-
)
|
32 |
-
query_embedding = embeddings.embeddings[0]
|
33 |
-
return query_embedding, user_query
|
34 |
-
|
35 |
-
|
36 |
-
def search_wiki_for_query(
|
37 |
-
query_embedding,
|
38 |
-
num_results = 3,
|
39 |
-
user_query= "",
|
40 |
-
languages = [],
|
41 |
-
match_text = None,
|
42 |
-
):
|
43 |
-
filters = []
|
44 |
-
|
45 |
-
language_mapping = {
|
46 |
-
"English": "en",
|
47 |
-
"Yoruba": "yo",
|
48 |
-
"Igbo": "ig",
|
49 |
-
"Hause": "ha",
|
50 |
-
}
|
51 |
-
|
52 |
-
# prepare filters to narrow down search results
|
53 |
-
# if the `match_text` list is not empty then create filter to find exact matching text in the documents
|
54 |
-
if match_text:
|
55 |
-
filters.append(
|
56 |
-
models.FieldCondition(
|
57 |
-
key="text",
|
58 |
-
match=models.MatchText(text=user_query),
|
59 |
-
)
|
60 |
-
)
|
61 |
-
|
62 |
-
# filter documents based on language before performing search:
|
63 |
-
if languages:
|
64 |
-
for lang in languages:
|
65 |
-
filters.append(
|
66 |
-
models.FieldCondition(
|
67 |
-
key="lang",
|
68 |
-
match=models.MatchValue(
|
69 |
-
value=language_mapping[lang],
|
70 |
-
),
|
71 |
-
)
|
72 |
-
)
|
73 |
-
|
74 |
-
# perform search and get results
|
75 |
-
results = qdrant_client.search(
|
76 |
-
collection_name=COLLECTION,
|
77 |
-
query_filter=models.Filter(should=filters),
|
78 |
-
search_params=models.SearchParams(hnsw_ef=128, exact=False),
|
79 |
-
query_vector=query_embedding,
|
80 |
-
limit=num_results,
|
81 |
-
)
|
82 |
-
return results
|
83 |
-
|
84 |
-
|
85 |
-
def cross_lingual_document_search(
|
86 |
-
user_input: str, num_results: int, languages, text_match
|
87 |
-
) -> List:
|
88 |
-
"""
|
89 |
-
Wrapper function for performing search on the collection of documents for the given user query.
|
90 |
-
Prepares query embedding, retrieves search results, checks if expected number of search results are being returned.
|
91 |
-
Args:
|
92 |
-
user_input (`str`):
|
93 |
-
The user input based on which search will be performed.
|
94 |
-
num_results (`str`):
|
95 |
-
The number of expected search results.
|
96 |
-
languages (`str`):
|
97 |
-
The list of languages based on which search results must be filtered.
|
98 |
-
text_match (`str`):
|
99 |
-
A field based on which it is decided whether to perform full-text-match while performing search.
|
100 |
-
Returns:
|
101 |
-
final_results (`List[str]`):
|
102 |
-
A list containing the final search results corresponding to the given user input.
|
103 |
-
"""
|
104 |
-
# create an embedding for the input query
|
105 |
-
query_embedding, _ = embed_user_query(user_input)
|
106 |
-
|
107 |
-
# retrieve search results
|
108 |
-
result = search_wiki_for_query(
|
109 |
-
query_embedding,
|
110 |
-
num_results,
|
111 |
-
user_input,
|
112 |
-
languages,
|
113 |
-
text_match,
|
114 |
-
)
|
115 |
-
final_results = [result[i].payload["text"] for i in range(len(result))]
|
116 |
-
|
117 |
-
# check if number of search results obtained (i.e. `final_results`) is matching with number of expected search results i.e. `num_results`
|
118 |
-
if num_results > len(final_results):
|
119 |
-
remaining_inputs = num_results - len(final_results)
|
120 |
-
for input in range(remaining_inputs):
|
121 |
-
final_results.append("")
|
122 |
-
|
123 |
-
return final_results
|
124 |
-
|
125 |
-
def document_source(
|
126 |
-
user_input: str, num_results: int, languages, text_match
|
127 |
-
) -> List:
|
128 |
-
query_embedding, _ = embed_user_query(user_input)
|
129 |
-
|
130 |
-
# retrieve search results
|
131 |
-
result = search_wiki_for_query(
|
132 |
-
query_embedding,
|
133 |
-
num_results,
|
134 |
-
user_input,
|
135 |
-
languages,
|
136 |
-
text_match,
|
137 |
-
)
|
138 |
-
sources = [result[i].payload["url"] for i in range(len(result))]
|
139 |
-
|
140 |
-
# check if number of search results obtained (i.e. `final_results`) is matching with number of expected search results i.e. `num_results`
|
141 |
-
if num_results > len(sources):
|
142 |
-
remaining_inputs = num_results - len(sources)
|
143 |
-
for input in range(remaining_inputs):
|
144 |
-
sources.append("")
|
145 |
-
|
146 |
-
return sources
|
147 |
-
|
148 |
-
|
149 |
-
def translate_search_result():
|
150 |
-
pass
|
151 |
-
|
152 |
-
if __name__ == "__main__":
|
153 |
-
# query_embedding, user_query = embed_user_query("Who is the president of Nigeria")
|
154 |
-
# result = search_wiki_for_query(query_embedding,user_query=user_query)
|
155 |
-
|
156 |
-
# for item in result:
|
157 |
-
# print(item.payload["url"])
|
158 |
-
result = cross_lingual_document_search("Who is the president of Nigeria",
|
159 |
-
num_results=3,
|
160 |
-
languages=["Yoruba"],
|
161 |
-
text_match=False)
|
162 |
-
print(result, len(result))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|