Upload folder using huggingface_hub
Browse files- .gitignore +0 -3
- README.md +1 -1
- app.py +146 -45
- climateqa/engine/embeddings.py +2 -6
- climateqa/engine/text_retriever.py +0 -1
- climateqa/engine/vectorstore.py +45 -86
- logs/1715672103.255797.json +1 -0
- logs/1715673060.457813.json +1 -0
- logs/1715673178.788617.json +1 -0
- logs/1715675920.752972.json +1 -0
- requirements.txt +1 -2
- style.css +1 -73
- test +6 -3
- vectors/index.annoy +1 -1
- vectors/index.pkl +1 -1
.gitignore
CHANGED
@@ -9,9 +9,6 @@ setAPIKEY.sh
|
|
9 |
.AppleDouble
|
10 |
.LSOverride
|
11 |
|
12 |
-
# Historique conversasion with chatbot
|
13 |
-
*.json
|
14 |
-
|
15 |
# Icon must end with two \r
|
16 |
Icon
|
17 |
|
|
|
9 |
.AppleDouble
|
10 |
.LSOverride
|
11 |
|
|
|
|
|
|
|
12 |
# Icon must end with two \r
|
13 |
Icon
|
14 |
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
app_file: app.py
|
4 |
sdk: gradio
|
5 |
sdk_version: 4.19.1
|
|
|
1 |
---
|
2 |
+
title: Clara
|
3 |
app_file: app.py
|
4 |
sdk: gradio
|
5 |
sdk_version: 4.19.1
|
app.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
from climateqa.engine.
|
|
|
|
|
5 |
from climateqa.engine.text_retriever import ClimateQARetriever
|
6 |
from climateqa.engine.rag import make_rag_chain
|
7 |
from climateqa.engine.llm import get_llm
|
@@ -10,9 +12,11 @@ from datetime import datetime
|
|
10 |
import json
|
11 |
import re
|
12 |
import gradio as gr
|
|
|
13 |
from sentence_transformers import CrossEncoder
|
14 |
|
15 |
reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
|
|
|
16 |
|
17 |
# Load environment variables in local mode
|
18 |
try:
|
@@ -22,9 +26,9 @@ except Exception as e:
|
|
22 |
pass
|
23 |
|
24 |
# Set up Gradio Theme
|
25 |
-
theme = gr.themes.
|
26 |
-
primary_hue="
|
27 |
-
secondary_hue="
|
28 |
font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
|
29 |
"system-ui", "sans-serif"],
|
30 |
)
|
@@ -70,6 +74,9 @@ def serialize_docs(docs):
|
|
70 |
|
71 |
|
72 |
# Create vectorstore and retriever
|
|
|
|
|
|
|
73 |
vectorstore = build_vectores_stores("./sources")
|
74 |
llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
|
75 |
|
@@ -160,7 +167,7 @@ async def chat(query, history):
|
|
160 |
"answer": history[-1][1],
|
161 |
"time": timestamp,
|
162 |
}
|
163 |
-
|
164 |
|
165 |
yield history, docs_html, output_query, output_language, gallery, output_query, output_keywords
|
166 |
|
@@ -178,7 +185,7 @@ def make_html_source(source, i):
|
|
178 |
<div class="card-content">
|
179 |
<div>
|
180 |
<div style="float:right;width 10%;position:relative;top:0px">
|
181 |
-
<a href='{meta['ax_url']}'
|
182 |
</div>
|
183 |
<div>
|
184 |
<h2>Extrait {i}</h2>
|
@@ -188,9 +195,9 @@ def make_html_source(source, i):
|
|
188 |
<p>{text_content}</p>
|
189 |
|
190 |
</div>
|
191 |
-
|
192 |
<span>{name}</span>
|
193 |
-
</div>
|
194 |
</div>
|
195 |
"""
|
196 |
|
@@ -206,6 +213,79 @@ def log_locally(file, logs):
|
|
206 |
f.write(logs_json)
|
207 |
|
208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
# --------------------------------------------------------------------
|
210 |
# Gradio
|
211 |
# --------------------------------------------------------------------
|
@@ -226,13 +306,8 @@ What would you like to know today?
|
|
226 |
"""
|
227 |
|
228 |
|
229 |
-
with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component"
|
230 |
-
|
231 |
-
gr.HTML("""
|
232 |
-
<img style="width:100px" src="file/assets/axionable.svg"/>
|
233 |
-
""", elem_classes="logo-axio ")
|
234 |
|
235 |
-
# TAB Clara
|
236 |
with gr.Tab("CLARA"):
|
237 |
|
238 |
with gr.Row(elem_id="chatbot-row"):
|
@@ -244,44 +319,57 @@ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-compon
|
|
244 |
|
245 |
with gr.Row(elem_id="input-message"):
|
246 |
textbox = gr.Textbox(placeholder="Posez votre question", show_label=False,
|
247 |
-
|
248 |
-
|
249 |
|
250 |
with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
|
251 |
|
252 |
-
with gr.
|
253 |
-
|
254 |
-
gr.HTML("<p>Sources</p>")
|
255 |
|
256 |
-
|
257 |
-
|
258 |
-
|
|
|
259 |
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
with gr.Tabs() as tabs:
|
264 |
-
None
|
265 |
|
|
|
|
|
|
|
|
|
266 |
|
267 |
-
# TAB A propos
|
268 |
-
with gr.Tab("À propos", elem_classes="max-height other-tabs"):
|
269 |
with gr.Row():
|
270 |
with gr.Column(scale=1):
|
271 |
-
gr.
|
272 |
-
|
273 |
-
|
|
|
|
|
|
|
|
|
|
|
274 |
|
|
|
275 |
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
# with gr.Row(elem_id="config-row"):
|
280 |
-
# with gr.Column(scale=1):
|
281 |
-
#
|
282 |
-
# for pdfName in get_PDF_Names_from_GCP():
|
283 |
-
# gr.Markdown( pdfName, elem_classes="a-propos")
|
284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
|
286 |
def start_chat(query, history):
|
287 |
history = history + [(query, None)]
|
@@ -298,8 +386,21 @@ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-compon
|
|
298 |
)
|
299 |
|
300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
301 |
demo.queue()
|
302 |
|
303 |
demo.launch(allowed_paths=["assets/download.png",
|
304 |
-
"assets/logo4.png",
|
305 |
-
|
|
|
1 |
+
#from climateqa.engine.vectorstore import get_pinecone_vectorstore,
|
2 |
+
from climateqa.engine.vectorstore import build_vectores_stores
|
3 |
+
from climateqa.engine.embeddings import get_embeddings_function
|
4 |
+
from climateqa.engine.rag import make_rag_papers_chain
|
5 |
+
from climateqa.engine.keywords import make_keywords_chain
|
6 |
+
from climateqa.sample_questions import QUESTIONS
|
7 |
from climateqa.engine.text_retriever import ClimateQARetriever
|
8 |
from climateqa.engine.rag import make_rag_chain
|
9 |
from climateqa.engine.llm import get_llm
|
|
|
12 |
import json
|
13 |
import re
|
14 |
import gradio as gr
|
15 |
+
from climateqa.papers.openalex import OpenAlex
|
16 |
from sentence_transformers import CrossEncoder
|
17 |
|
18 |
reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
|
19 |
+
oa = OpenAlex()
|
20 |
|
21 |
# Load environment variables in local mode
|
22 |
try:
|
|
|
26 |
pass
|
27 |
|
28 |
# Set up Gradio Theme
|
29 |
+
theme = gr.themes.Base(
|
30 |
+
primary_hue="blue",
|
31 |
+
secondary_hue="red",
|
32 |
font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
|
33 |
"system-ui", "sans-serif"],
|
34 |
)
|
|
|
74 |
|
75 |
|
76 |
# Create vectorstore and retriever
|
77 |
+
embeddings_function = get_embeddings_function()
|
78 |
+
|
79 |
+
#vectorstore = get_pinecone_vectorstore(embeddings_function)
|
80 |
vectorstore = build_vectores_stores("./sources")
|
81 |
llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
|
82 |
|
|
|
167 |
"answer": history[-1][1],
|
168 |
"time": timestamp,
|
169 |
}
|
170 |
+
log_locally(log_file, logs)
|
171 |
|
172 |
yield history, docs_html, output_query, output_language, gallery, output_query, output_keywords
|
173 |
|
|
|
185 |
<div class="card-content">
|
186 |
<div>
|
187 |
<div style="float:right;width 10%;position:relative;top:0px">
|
188 |
+
<a href='{meta['ax_url']}'><img style="width:20px" src='/file/assets/download.png' /></a>
|
189 |
</div>
|
190 |
<div>
|
191 |
<h2>Extrait {i}</h2>
|
|
|
195 |
<p>{text_content}</p>
|
196 |
|
197 |
</div>
|
198 |
+
<div class="card-footer">
|
199 |
<span>{name}</span>
|
200 |
+
</div>
|
201 |
</div>
|
202 |
"""
|
203 |
|
|
|
213 |
f.write(logs_json)
|
214 |
|
215 |
|
216 |
+
def generate_keywords(query):
|
217 |
+
chain = make_keywords_chain(llm)
|
218 |
+
keywords = chain.invoke(query)
|
219 |
+
keywords = " AND ".join(keywords["keywords"])
|
220 |
+
return keywords
|
221 |
+
|
222 |
+
|
223 |
+
papers_cols_widths = {
|
224 |
+
"doc": 50,
|
225 |
+
"id": 100,
|
226 |
+
"title": 300,
|
227 |
+
"doi": 100,
|
228 |
+
"publication_year": 100,
|
229 |
+
"abstract": 500,
|
230 |
+
"rerank_score": 100,
|
231 |
+
"is_oa": 50,
|
232 |
+
}
|
233 |
+
|
234 |
+
papers_cols = list(papers_cols_widths.keys())
|
235 |
+
papers_cols_widths = list(papers_cols_widths.values())
|
236 |
+
|
237 |
+
|
238 |
+
async def find_papers(query, keywords, after):
|
239 |
+
|
240 |
+
summary = ""
|
241 |
+
|
242 |
+
df_works = oa.search(keywords, after=after)
|
243 |
+
df_works = df_works.dropna(subset=["abstract"])
|
244 |
+
df_works = oa.rerank(query, df_works, reranker)
|
245 |
+
df_works = df_works.sort_values("rerank_score", ascending=False)
|
246 |
+
G = oa.make_network(df_works)
|
247 |
+
|
248 |
+
height = "750px"
|
249 |
+
network = oa.show_network(
|
250 |
+
G, color_by="rerank_score", notebook=False, height=height)
|
251 |
+
network_html = network.generate_html()
|
252 |
+
|
253 |
+
network_html = network_html.replace("'", "\"")
|
254 |
+
css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
|
255 |
+
network_html = network_html + css_to_inject
|
256 |
+
|
257 |
+
network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
|
258 |
+
display-capture; encrypted-media;" sandbox="allow-modals allow-forms
|
259 |
+
allow-scripts allow-same-origin allow-popups
|
260 |
+
allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
|
261 |
+
allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
|
262 |
+
|
263 |
+
docs = df_works["content"].head(15).tolist()
|
264 |
+
|
265 |
+
df_works = df_works.reset_index(
|
266 |
+
drop=True).reset_index().rename(columns={"index": "doc"})
|
267 |
+
df_works["doc"] = df_works["doc"] + 1
|
268 |
+
df_works = df_works[papers_cols]
|
269 |
+
|
270 |
+
yield df_works, network_html, summary
|
271 |
+
|
272 |
+
chain = make_rag_papers_chain(llm)
|
273 |
+
result = chain.astream_log(
|
274 |
+
{"question": query, "docs": docs, "language": "English"})
|
275 |
+
path_answer = "/logs/StrOutputParser/streamed_output/-"
|
276 |
+
|
277 |
+
async for op in result:
|
278 |
+
|
279 |
+
op = op.ops[0]
|
280 |
+
|
281 |
+
if op['path'] == path_answer: # reforulated question
|
282 |
+
new_token = op['value'] # str
|
283 |
+
summary += new_token
|
284 |
+
else:
|
285 |
+
continue
|
286 |
+
yield df_works, network_html, summary
|
287 |
+
|
288 |
+
|
289 |
# --------------------------------------------------------------------
|
290 |
# Gradio
|
291 |
# --------------------------------------------------------------------
|
|
|
306 |
"""
|
307 |
|
308 |
|
309 |
+
with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component") as demo:
|
|
|
|
|
|
|
|
|
310 |
|
|
|
311 |
with gr.Tab("CLARA"):
|
312 |
|
313 |
with gr.Row(elem_id="chatbot-row"):
|
|
|
319 |
|
320 |
with gr.Row(elem_id="input-message"):
|
321 |
textbox = gr.Textbox(placeholder="Posez votre question", show_label=False,
|
322 |
+
scale=7, lines=1, interactive=True, elem_id="input-textbox")
|
|
|
323 |
|
324 |
with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
|
325 |
|
326 |
+
with gr.Tabs() as tabs:
|
|
|
|
|
327 |
|
328 |
+
with gr.Tab("Sources", elem_id="tab-citations", id=1):
|
329 |
+
sources_textbox = gr.HTML(
|
330 |
+
show_label=False, elem_id="sources-textbox")
|
331 |
+
docs_textbox = gr.State("")
|
332 |
|
333 |
+
# ---------------------------------------------------------------------------------------
|
334 |
+
# OTHER TABS
|
335 |
+
# ---------------------------------------------------------------------------------------
|
|
|
|
|
336 |
|
337 |
+
with gr.Tab("Figures", elem_id="tab-images", elem_classes="max-height other-tabs"):
|
338 |
+
gallery_component = gr.Gallery()
|
339 |
+
|
340 |
+
with gr.Tab("Papers (beta)", elem_id="tab-papers", elem_classes="max-height other-tabs"):
|
341 |
|
|
|
|
|
342 |
with gr.Row():
|
343 |
with gr.Column(scale=1):
|
344 |
+
query_papers = gr.Textbox(
|
345 |
+
placeholder="Question", show_label=False, lines=1, interactive=True, elem_id="query-papers")
|
346 |
+
keywords_papers = gr.Textbox(
|
347 |
+
placeholder="Keywords", show_label=False, lines=1, interactive=True, elem_id="keywords-papers")
|
348 |
+
after = gr.Slider(minimum=1950, maximum=2023, step=1, value=1960,
|
349 |
+
label="Publication date", show_label=True, interactive=True, elem_id="date-papers")
|
350 |
+
search_papers = gr.Button(
|
351 |
+
"Search", elem_id="search-papers", interactive=True)
|
352 |
|
353 |
+
with gr.Column(scale=7):
|
354 |
|
355 |
+
with gr.Tab("Summary", elem_id="papers-summary-tab"):
|
356 |
+
papers_summary = gr.Markdown(
|
357 |
+
visible=True, elem_id="papers-summary")
|
|
|
|
|
|
|
|
|
|
|
358 |
|
359 |
+
with gr.Tab("Relevant papers", elem_id="papers-results-tab"):
|
360 |
+
papers_dataframe = gr.Dataframe(
|
361 |
+
visible=True, elem_id="papers-table", headers=papers_cols)
|
362 |
+
|
363 |
+
with gr.Tab("Citations network", elem_id="papers-network-tab"):
|
364 |
+
citations_network = gr.HTML(
|
365 |
+
visible=True, elem_id="papers-citations-network")
|
366 |
+
|
367 |
+
with gr.Tab("À propos", elem_classes="max-height other-tabs"):
|
368 |
+
with gr.Row():
|
369 |
+
with gr.Column(scale=1):
|
370 |
+
gr.Markdown(
|
371 |
+
"CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
|
372 |
+
"– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)")
|
373 |
|
374 |
def start_chat(query, history):
|
375 |
history = history + [(query, None)]
|
|
|
386 |
)
|
387 |
|
388 |
|
389 |
+
|
390 |
+
def change_sample_questions(key):
|
391 |
+
index = list(QUESTIONS.keys()).index(key)
|
392 |
+
visible_bools = [False] * len(samples)
|
393 |
+
visible_bools[index] = True
|
394 |
+
return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
|
395 |
+
|
396 |
+
# dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
|
397 |
+
|
398 |
+
query_papers.submit(generate_keywords, [query_papers], [keywords_papers])
|
399 |
+
search_papers.click(find_papers, [query_papers, keywords_papers, after], [
|
400 |
+
papers_dataframe, citations_network, papers_summary])
|
401 |
+
|
402 |
demo.queue()
|
403 |
|
404 |
demo.launch(allowed_paths=["assets/download.png",
|
405 |
+
"assets/logo4.png"],
|
406 |
+
favicon_path="assets/logo4.png")
|
climateqa/engine/embeddings.py
CHANGED
@@ -8,12 +8,8 @@ def get_embeddings_function(version = "v1.2"):
|
|
8 |
|
9 |
# https://huggingface.co/BAAI/bge-base-en-v1.5
|
10 |
# Best embedding model at a reasonable size at the moment (2023-11-22)
|
11 |
-
|
12 |
-
|
13 |
-
# https://huggingface.co/BAAI/bge-m3
|
14 |
-
# A better one from 2024-04
|
15 |
-
model_name = "BAAI/bge-m3"
|
16 |
-
|
17 |
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
|
18 |
print("Loading embeddings model: ", model_name)
|
19 |
embeddings_function = HuggingFaceBgeEmbeddings(
|
|
|
8 |
|
9 |
# https://huggingface.co/BAAI/bge-base-en-v1.5
|
10 |
# Best embedding model at a reasonable size at the moment (2023-11-22)
|
11 |
+
|
12 |
+
model_name = "BAAI/bge-base-en-v1.5"
|
|
|
|
|
|
|
|
|
13 |
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
|
14 |
print("Loading embeddings model: ", model_name)
|
15 |
embeddings_function = HuggingFaceBgeEmbeddings(
|
climateqa/engine/text_retriever.py
CHANGED
@@ -45,4 +45,3 @@ class ClimateQARetriever(BaseRetriever):
|
|
45 |
doc.metadata["page_number"] = 1
|
46 |
results.append(doc)
|
47 |
return results
|
48 |
-
|
|
|
45 |
doc.metadata["page_number"] = 1
|
46 |
results.append(doc)
|
47 |
return results
|
|
climateqa/engine/vectorstore.py
CHANGED
@@ -1,94 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
|
2 |
-
|
3 |
-
#
|
4 |
-
|
5 |
-
|
|
|
|
|
6 |
|
7 |
-
from langchain_pinecone import PineconeVectorStore
|
8 |
|
9 |
-
|
10 |
-
from langchain_text_splitters import CharacterTextSplitter
|
11 |
-
from climateqa.engine.embeddings import get_embeddings_function
|
12 |
-
embeddings_function = get_embeddings_function()
|
13 |
|
|
|
|
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
18 |
|
19 |
|
|
|
|
|
|
|
|
|
|
|
20 |
import os
|
21 |
import pdfplumber
|
22 |
|
23 |
-
def get_PDF_Names_from_GCP():
|
24 |
-
|
25 |
-
listName = []
|
26 |
-
# Récupération des fichier depuis GCP storage
|
27 |
-
blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
|
28 |
-
for blob in blobs:
|
29 |
-
listName.append(blob.name)
|
30 |
-
return listName
|
31 |
-
|
32 |
-
def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
|
33 |
-
|
34 |
-
# Récupération des fichier depuis GCP storage
|
35 |
-
blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
|
36 |
-
for blob in blobs:
|
37 |
-
|
38 |
-
print( "\n"+blob.name+":")
|
39 |
-
print( " <- Téléchargement Depuis GCP")
|
40 |
-
blob.download_to_filename(pdf_folder+"/"+blob.name)
|
41 |
-
|
42 |
-
# Extraction des textes dpuis les fichiers PDF
|
43 |
-
print(" >>> Extraction PDF")
|
44 |
-
for pdf_file in os.listdir(pdf_folder):
|
45 |
-
if pdf_file.startswith("."):
|
46 |
-
continue
|
47 |
-
print(" > "+pdf_folder+"/"+pdf_file)
|
48 |
-
pdf_total_pages = 0
|
49 |
-
with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
|
50 |
-
pdf_total_pages = len(pdf.pages)
|
51 |
-
|
52 |
-
# Fuite mémoire pour les gros fichiers
|
53 |
-
# Reouvrir le fichier à chaque N page semble rélgler le problème
|
54 |
-
N_page = 300
|
55 |
-
page_number = 0
|
56 |
-
while page_number < pdf_total_pages:
|
57 |
-
|
58 |
-
print(" -- ouverture du fichier pour "+str(N_page)+ " pages --" )
|
59 |
-
with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
|
60 |
-
|
61 |
-
npage = 0
|
62 |
-
while (npage < N_page and page_number < pdf_total_pages) :
|
63 |
-
|
64 |
-
print(" >>> "+str(page_number+1))
|
65 |
-
f = open(folder_path+"/"+pdf_file+"..:page:.."+str(page_number+1), "w")
|
66 |
-
for char_pdf in pdf.pages[page_number].chars:
|
67 |
-
f.write(char_pdf["text"])
|
68 |
-
f.close()
|
69 |
-
|
70 |
-
npage = npage + 1
|
71 |
-
page_number = page_number + 1
|
72 |
-
|
73 |
-
|
74 |
-
print(" X removing: " + blob.name )
|
75 |
-
os.remove(pdf_folder+"/"+blob.name)
|
76 |
-
|
77 |
-
|
78 |
def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
embedding=embeddings_function,
|
83 |
-
#namespace=namespace
|
84 |
-
)
|
85 |
-
|
86 |
-
return vectorstore
|
87 |
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
docs = []
|
91 |
-
|
92 |
for filename in os.listdir(folder_path):
|
93 |
if filename.startswith("."):
|
94 |
continue
|
@@ -99,17 +63,12 @@ def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vec
|
|
99 |
|
100 |
for doc in documents:
|
101 |
if (doc.metadata):
|
102 |
-
doc.metadata["ax_page"] = doc.metadata['source'].split("
|
103 |
-
doc.metadata["ax_name"] = doc.metadata['source'].split("
|
104 |
doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
|
105 |
|
106 |
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
107 |
docs += text_splitter.split_documents(documents)
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
return vectorstore
|
112 |
-
|
113 |
-
|
114 |
-
print("MISSING VECTORS")
|
115 |
-
exit(0)
|
|
|
1 |
+
# Pinecone
|
2 |
+
# More info at https://docs.pinecone.io/docs/langchain
|
3 |
+
# And https://python.langchain.com/docs/integrations/vectorstores/pinecone
|
4 |
+
# import os
|
5 |
+
# from pinecone import Pinecone
|
6 |
+
# from langchain_community.vectorstores import Pinecone as PineconeVectorstore
|
7 |
|
8 |
+
# # LOAD ENVIRONMENT VARIABLES
|
9 |
+
# try:
|
10 |
+
# from dotenv import load_dotenv
|
11 |
+
# load_dotenv()
|
12 |
+
# except:
|
13 |
+
# pass
|
14 |
|
|
|
15 |
|
16 |
+
# def get_pinecone_vectorstore(embeddings,text_key = "content"):
|
|
|
|
|
|
|
17 |
|
18 |
+
# pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
|
19 |
+
# index = pc.Index(os.getenv("PINECONE_API_INDEX"))
|
20 |
|
21 |
+
# vectorstore = PineconeVectorstore(
|
22 |
+
# index, embeddings, text_key,
|
23 |
+
# )
|
24 |
+
# return vectorstore
|
25 |
|
26 |
|
27 |
+
from langchain_community.vectorstores import Annoy
|
28 |
+
from langchain_community.document_loaders import TextLoader
|
29 |
+
from langchain_text_splitters import CharacterTextSplitter
|
30 |
+
from climateqa.engine.embeddings import get_embeddings_function
|
31 |
+
embeddings_function = get_embeddings_function()
|
32 |
import os
|
33 |
import pdfplumber
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
|
36 |
|
37 |
+
if os.path.isfile(vectors_path+"/index.annoy"):
|
38 |
+
return Annoy.load_local(vectors_path, embeddings_function,allow_dangerous_deserialization=True)
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
+
# Extract text from PDF files
|
41 |
+
print("Extraction PDF ...")
|
42 |
+
for pdf_file in os.listdir(pdf_folder):
|
43 |
+
if pdf_file.startswith("."):
|
44 |
+
continue
|
45 |
+
print(" > "+pdf_folder+"/"+pdf_file)
|
46 |
+
with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
|
47 |
+
for pdf_page in pdf.pages:
|
48 |
+
f = open(folder_path+"/"+pdf_file+" page "+str(pdf_page.page_number), "w")
|
49 |
+
# f.write(pdf_file+" page "+str(pdf_page.page_number))
|
50 |
+
for char_pdf in pdf_page.chars:
|
51 |
+
f.write(char_pdf["text"])
|
52 |
+
f.close()
|
53 |
|
54 |
docs = []
|
55 |
+
vector_store_from_docs = () # Créer un nouvel objet Annoy ou utiliser celui déjà initialisé selon votre code existant
|
56 |
for filename in os.listdir(folder_path):
|
57 |
if filename.startswith("."):
|
58 |
continue
|
|
|
63 |
|
64 |
for doc in documents:
|
65 |
if (doc.metadata):
|
66 |
+
doc.metadata["ax_page"] = doc.metadata['source'].split(" ")[-1]
|
67 |
+
doc.metadata["ax_name"] = doc.metadata['source'].split(" ")[0].split("/")[-1]
|
68 |
doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
|
69 |
|
70 |
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
71 |
docs += text_splitter.split_documents(documents)
|
72 |
+
vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
|
73 |
+
vector_store_from_docs.save_local(vectors_path)
|
74 |
+
return vector_store_from_docs
|
|
|
|
|
|
|
|
|
|
logs/1715672103.255797.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"user_id": "245d9442-2651-4578-8f63-4ed4145c0a40", "prompt": "quels risques physiques en 2024 ?", "query": "quels risques physiques en 2024 ?", "question": "What are the projected physical risks in 2024 related to climate change?", "sources": ["Custom"], "docs": [], "answer": "Je suis d\u00e9sol\u00e9, mais les extraits fournis ne contiennent pas d'informations sur les risques physiques projet\u00e9s en 2024 li\u00e9s au changement climatique. Je n'ai pas suffisamment d'informations pour r\u00e9pondre \u00e0 cette question.", "time": "1715672103.255797"}
|
logs/1715673060.457813.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"user_id": "c936b5ac-affe-4df7-9f9c-6f78ae83525e", "prompt": "quels risques physiques ?", "query": "quels risques physiques ?", "question": "What are the physical risks associated with climate change?", "sources": ["Custom"], "docs": [], "answer": "Je suis d\u00e9sol\u00e9, mais les extraits fournis ne contiennent pas d'informations sur les risques physiques associ\u00e9s au changement climatique. Je n'ai pas assez d'informations pour r\u00e9pondre \u00e0 cette question.", "time": "1715673060.457813"}
|
logs/1715673178.788617.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"user_id": "80fd8b5f-4cf2-430c-8c14-09b4acc7436c", "prompt": "quels risques physiques ?", "query": "quels risques physiques ?", "question": "What are the physical risks associated with climate change?", "sources": ["Custom"], "docs": [], "answer": "Je suis d\u00e9sol\u00e9e, mais les extraits fournis ne contiennent pas d'informations sur les risques physiques associ\u00e9s au changement climatique. Je n'ai pas suffisamment d'informations pour r\u00e9pondre \u00e0 cette question.", "time": "1715673178.788617"}
|
logs/1715675920.752972.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"user_id": "d06c2c8a-ab31-4dcd-8d5c-69a4a678221b", "prompt": "quels sont les risques physiques en 2024 ?", "query": "quels sont les risques physiques en 2024 ?", "question": "What are the projected physical risks in 2024?", "sources": ["Custom"], "docs": [], "answer": "Je suis d\u00e9sol\u00e9e, mais les documents fournis ne contiennent pas d'informations sur les risques physiques projet\u00e9s en 2024. Je n'ai pas assez d'informations pour r\u00e9pondre \u00e0 cette question.", "time": "1715675920.752972"}
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
google-cloud-storage==2.16.0
|
2 |
gradio==4.19.1
|
|
|
3 |
python-dotenv==1.0.0
|
4 |
langchain==0.1.10
|
5 |
langchain_openai==0.0.6
|
@@ -11,5 +11,4 @@ pyalex==0.13
|
|
11 |
networkx==3.2.1
|
12 |
pyvis==0.3.2
|
13 |
annoy==1.17.3
|
14 |
-
langchain_pinecone
|
15 |
pdfplumber
|
|
|
|
|
1 |
gradio==4.19.1
|
2 |
+
gunicorn==22.0.0
|
3 |
python-dotenv==1.0.0
|
4 |
langchain==0.1.10
|
5 |
langchain_openai==0.0.6
|
|
|
11 |
networkx==3.2.1
|
12 |
pyvis==0.3.2
|
13 |
annoy==1.17.3
|
|
|
14 |
pdfplumber
|
style.css
CHANGED
@@ -3,78 +3,6 @@
|
|
3 |
--user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
|
4 |
} */
|
5 |
|
6 |
-
.logo-axio {
|
7 |
-
float: right;
|
8 |
-
position: absolute;
|
9 |
-
right: 0px;
|
10 |
-
}
|
11 |
-
|
12 |
-
|
13 |
-
/* couleur text */
|
14 |
-
p {
|
15 |
-
color: black !important;
|
16 |
-
}
|
17 |
-
li {
|
18 |
-
color: black !important;
|
19 |
-
}
|
20 |
-
|
21 |
-
button.selected {
|
22 |
-
border-radius: 20px !important;
|
23 |
-
}
|
24 |
-
button:hover {
|
25 |
-
color: #ffc000 !important;
|
26 |
-
}
|
27 |
-
|
28 |
-
|
29 |
-
/* fond panels/blocks */
|
30 |
-
.panel {
|
31 |
-
background-color: #eeeeee !important;
|
32 |
-
border: 0px;
|
33 |
-
}
|
34 |
-
.block {
|
35 |
-
background-color: #eeeeee !important;
|
36 |
-
}
|
37 |
-
|
38 |
-
/* fond bot */
|
39 |
-
.bot {
|
40 |
-
background-color: #eeeeee !important;
|
41 |
-
}
|
42 |
-
|
43 |
-
/* avatar en debut de reponse */
|
44 |
-
.avatar-container {
|
45 |
-
align-self: baseline !important;
|
46 |
-
margin-top: 35px;
|
47 |
-
}
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
/* fond user */
|
52 |
-
.user {
|
53 |
-
background-color: #d2d2d2 !important;
|
54 |
-
}
|
55 |
-
textarea {
|
56 |
-
background-color: #d2d2d2 !important;
|
57 |
-
color: black !important;
|
58 |
-
}
|
59 |
-
|
60 |
-
|
61 |
-
/* fond app */
|
62 |
-
gradio-app {
|
63 |
-
background-color: #ffffff !important;
|
64 |
-
}
|
65 |
-
.gradio-container {
|
66 |
-
background-color: #ffffff !important;
|
67 |
-
max-width: 100% !important;
|
68 |
-
width: 100% !important;
|
69 |
-
}
|
70 |
-
|
71 |
-
|
72 |
-
.a-propos {
|
73 |
-
margin: 20px !important;
|
74 |
-
}
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
.telecharger {
|
79 |
border: 1px solid;
|
80 |
padding: 5px;
|
@@ -115,7 +43,7 @@ body.dark .warning-box * {
|
|
115 |
|
116 |
|
117 |
body.dark .tip-box * {
|
118 |
-
color:
|
119 |
}
|
120 |
|
121 |
|
|
|
3 |
--user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
|
4 |
} */
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
.telecharger {
|
7 |
border: 1px solid;
|
8 |
padding: 5px;
|
|
|
43 |
|
44 |
|
45 |
body.dark .tip-box * {
|
46 |
+
color:black !important;
|
47 |
}
|
48 |
|
49 |
|
test
CHANGED
@@ -19,7 +19,8 @@ ENV HOME=/home/user \
|
|
19 |
GRADIO_NUM_PORTS=1 \
|
20 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
21 |
GRADIO_THEME=huggingface \
|
22 |
-
SYSTEM=spaces
|
|
|
23 |
|
24 |
# Set the working directory to the user's home directory
|
25 |
WORKDIR $HOME/app
|
@@ -27,6 +28,8 @@ WORKDIR $HOME/app
|
|
27 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
28 |
COPY --chown=user . $HOME/app
|
29 |
|
30 |
-
CMD ["python","setup.py"]
|
31 |
|
32 |
-
CMD ["python", "app.py"]
|
|
|
|
|
|
19 |
GRADIO_NUM_PORTS=1 \
|
20 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
21 |
GRADIO_THEME=huggingface \
|
22 |
+
SYSTEM=spaces \
|
23 |
+
PORT=7860
|
24 |
|
25 |
# Set the working directory to the user's home directory
|
26 |
WORKDIR $HOME/app
|
|
|
28 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
29 |
COPY --chown=user . $HOME/app
|
30 |
|
31 |
+
#CMD ["python","setup.py"]
|
32 |
|
33 |
+
#CMD ["python", "app.py"]
|
34 |
+
|
35 |
+
CMD gunicorn -b 0.0.0.0:$PORT app:demo
|
vectors/index.annoy
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2238984
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b94e9d486dbe3a9e2397672bda1d1c17198cca42a53afaa16ef8ecfcebd22fc9
|
3 |
size 2238984
|
vectors/index.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3223915
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4eb3d63539603642200f07f8fac2e290e94104fbbe4f4471dc663eff850263f6
|
3 |
size 3223915
|