Samiraxio commited on
Commit
e7e3a28
1 Parent(s): 0e83904

Upload folder using huggingface_hub

Browse files
.gitignore CHANGED
@@ -9,9 +9,6 @@ setAPIKEY.sh
9
  .AppleDouble
10
  .LSOverride
11
 
12
- # Historique conversasion with chatbot
13
- *.json
14
-
15
  # Icon must end with two \r
16
  Icon
17
 
 
9
  .AppleDouble
10
  .LSOverride
11
 
 
 
 
12
  # Icon must end with two \r
13
  Icon
14
 
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: clara
3
  app_file: app.py
4
  sdk: gradio
5
  sdk_version: 4.19.1
 
1
  ---
2
+ title: Clara
3
  app_file: app.py
4
  sdk: gradio
5
  sdk_version: 4.19.1
app.py CHANGED
@@ -1,7 +1,9 @@
1
-
2
-
3
- # , get_pinecone_vectorstore, find_similar_vectors
4
- from climateqa.engine.vectorstore import build_vectores_stores, get_PDF_Names_from_GCP
 
 
5
  from climateqa.engine.text_retriever import ClimateQARetriever
6
  from climateqa.engine.rag import make_rag_chain
7
  from climateqa.engine.llm import get_llm
@@ -10,9 +12,11 @@ from datetime import datetime
10
  import json
11
  import re
12
  import gradio as gr
 
13
  from sentence_transformers import CrossEncoder
14
 
15
  reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
 
16
 
17
  # Load environment variables in local mode
18
  try:
@@ -22,9 +26,9 @@ except Exception as e:
22
  pass
23
 
24
  # Set up Gradio Theme
25
- theme = gr.themes.Soft(
26
- primary_hue="yellow",
27
- secondary_hue="orange",
28
  font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
29
  "system-ui", "sans-serif"],
30
  )
@@ -70,6 +74,9 @@ def serialize_docs(docs):
70
 
71
 
72
  # Create vectorstore and retriever
 
 
 
73
  vectorstore = build_vectores_stores("./sources")
74
  llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
75
 
@@ -160,7 +167,7 @@ async def chat(query, history):
160
  "answer": history[-1][1],
161
  "time": timestamp,
162
  }
163
- #log_locally(log_file, logs)
164
 
165
  yield history, docs_html, output_query, output_language, gallery, output_query, output_keywords
166
 
@@ -178,7 +185,7 @@ def make_html_source(source, i):
178
  <div class="card-content">
179
  <div>
180
  <div style="float:right;width 10%;position:relative;top:0px">
181
- <a href='{meta['ax_url']}' target='_blank'><img style="width:20px" src='/file/assets/download.png' /></a>
182
  </div>
183
  <div>
184
  <h2>Extrait {i}</h2>
@@ -188,9 +195,9 @@ def make_html_source(source, i):
188
  <p>{text_content}</p>
189
 
190
  </div>
191
- <!-- <div class="card-footer">
192
  <span>{name}</span>
193
- </div> -->
194
  </div>
195
  """
196
 
@@ -206,6 +213,79 @@ def log_locally(file, logs):
206
  f.write(logs_json)
207
 
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  # --------------------------------------------------------------------
210
  # Gradio
211
  # --------------------------------------------------------------------
@@ -226,13 +306,8 @@ What would you like to know today?
226
  """
227
 
228
 
229
- with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component", elem_classes="ax_background") as demo:
230
-
231
- gr.HTML("""
232
- <img style="width:100px" src="file/assets/axionable.svg"/>
233
- """, elem_classes="logo-axio ")
234
 
235
- # TAB Clara
236
  with gr.Tab("CLARA"):
237
 
238
  with gr.Row(elem_id="chatbot-row"):
@@ -244,44 +319,57 @@ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-compon
244
 
245
  with gr.Row(elem_id="input-message"):
246
  textbox = gr.Textbox(placeholder="Posez votre question", show_label=False,
247
- scale=7, lines=1, interactive=True, elem_id="input-textbox")
248
-
249
 
250
  with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
251
 
252
- with gr.Column(scale=1, elem_id="tab-citations"):
253
-
254
- gr.HTML("<p>Sources</p>")
255
 
256
- sources_textbox = gr.HTML(
257
- show_label=False, elem_id="sources-textbox")
258
- docs_textbox = gr.State("")
 
259
 
260
- # l'object tabs est necessaire actuellement
261
- # J'ai l'impression qu'il est utiliser pour freezre les contenu des tabs
262
- # pendant que l'ia gènère une reponse ..
263
- with gr.Tabs() as tabs:
264
- None
265
 
 
 
 
 
266
 
267
- # TAB A propos
268
- with gr.Tab("À propos", elem_classes="max-height other-tabs"):
269
  with gr.Row():
270
  with gr.Column(scale=1):
271
- gr.Markdown(
272
- ("CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
273
- "– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)"), elem_classes="a-propos")
 
 
 
 
 
274
 
 
275
 
276
- # # TAB Configuration
277
- # with gr.Tab("Configuration"):
278
- #
279
- # with gr.Row(elem_id="config-row"):
280
- # with gr.Column(scale=1):
281
- #
282
- # for pdfName in get_PDF_Names_from_GCP():
283
- # gr.Markdown( pdfName, elem_classes="a-propos")
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
  def start_chat(query, history):
287
  history = history + [(query, None)]
@@ -298,8 +386,21 @@ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-compon
298
  )
299
 
300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  demo.queue()
302
 
303
  demo.launch(allowed_paths=["assets/download.png",
304
- "assets/logo4.png",
305
- "assets/axionable.svg"],favicon_path="assets/logo4.png")
 
1
+ #from climateqa.engine.vectorstore import get_pinecone_vectorstore,
2
+ from climateqa.engine.vectorstore import build_vectores_stores
3
+ from climateqa.engine.embeddings import get_embeddings_function
4
+ from climateqa.engine.rag import make_rag_papers_chain
5
+ from climateqa.engine.keywords import make_keywords_chain
6
+ from climateqa.sample_questions import QUESTIONS
7
  from climateqa.engine.text_retriever import ClimateQARetriever
8
  from climateqa.engine.rag import make_rag_chain
9
  from climateqa.engine.llm import get_llm
 
12
  import json
13
  import re
14
  import gradio as gr
15
+ from climateqa.papers.openalex import OpenAlex
16
  from sentence_transformers import CrossEncoder
17
 
18
  reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
19
+ oa = OpenAlex()
20
 
21
  # Load environment variables in local mode
22
  try:
 
26
  pass
27
 
28
  # Set up Gradio Theme
29
+ theme = gr.themes.Base(
30
+ primary_hue="blue",
31
+ secondary_hue="red",
32
  font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
33
  "system-ui", "sans-serif"],
34
  )
 
74
 
75
 
76
  # Create vectorstore and retriever
77
+ embeddings_function = get_embeddings_function()
78
+
79
+ #vectorstore = get_pinecone_vectorstore(embeddings_function)
80
  vectorstore = build_vectores_stores("./sources")
81
  llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
82
 
 
167
  "answer": history[-1][1],
168
  "time": timestamp,
169
  }
170
+ log_locally(log_file, logs)
171
 
172
  yield history, docs_html, output_query, output_language, gallery, output_query, output_keywords
173
 
 
185
  <div class="card-content">
186
  <div>
187
  <div style="float:right;width 10%;position:relative;top:0px">
188
+ <a href='{meta['ax_url']}'><img style="width:20px" src='/file/assets/download.png' /></a>
189
  </div>
190
  <div>
191
  <h2>Extrait {i}</h2>
 
195
  <p>{text_content}</p>
196
 
197
  </div>
198
+ <div class="card-footer">
199
  <span>{name}</span>
200
+ </div>
201
  </div>
202
  """
203
 
 
213
  f.write(logs_json)
214
 
215
 
216
+ def generate_keywords(query):
217
+ chain = make_keywords_chain(llm)
218
+ keywords = chain.invoke(query)
219
+ keywords = " AND ".join(keywords["keywords"])
220
+ return keywords
221
+
222
+
223
+ papers_cols_widths = {
224
+ "doc": 50,
225
+ "id": 100,
226
+ "title": 300,
227
+ "doi": 100,
228
+ "publication_year": 100,
229
+ "abstract": 500,
230
+ "rerank_score": 100,
231
+ "is_oa": 50,
232
+ }
233
+
234
+ papers_cols = list(papers_cols_widths.keys())
235
+ papers_cols_widths = list(papers_cols_widths.values())
236
+
237
+
238
+ async def find_papers(query, keywords, after):
239
+
240
+ summary = ""
241
+
242
+ df_works = oa.search(keywords, after=after)
243
+ df_works = df_works.dropna(subset=["abstract"])
244
+ df_works = oa.rerank(query, df_works, reranker)
245
+ df_works = df_works.sort_values("rerank_score", ascending=False)
246
+ G = oa.make_network(df_works)
247
+
248
+ height = "750px"
249
+ network = oa.show_network(
250
+ G, color_by="rerank_score", notebook=False, height=height)
251
+ network_html = network.generate_html()
252
+
253
+ network_html = network_html.replace("'", "\"")
254
+ css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
255
+ network_html = network_html + css_to_inject
256
+
257
+ network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
258
+ display-capture; encrypted-media;" sandbox="allow-modals allow-forms
259
+ allow-scripts allow-same-origin allow-popups
260
+ allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
261
+ allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
262
+
263
+ docs = df_works["content"].head(15).tolist()
264
+
265
+ df_works = df_works.reset_index(
266
+ drop=True).reset_index().rename(columns={"index": "doc"})
267
+ df_works["doc"] = df_works["doc"] + 1
268
+ df_works = df_works[papers_cols]
269
+
270
+ yield df_works, network_html, summary
271
+
272
+ chain = make_rag_papers_chain(llm)
273
+ result = chain.astream_log(
274
+ {"question": query, "docs": docs, "language": "English"})
275
+ path_answer = "/logs/StrOutputParser/streamed_output/-"
276
+
277
+ async for op in result:
278
+
279
+ op = op.ops[0]
280
+
281
+ if op['path'] == path_answer: # reforulated question
282
+ new_token = op['value'] # str
283
+ summary += new_token
284
+ else:
285
+ continue
286
+ yield df_works, network_html, summary
287
+
288
+
289
  # --------------------------------------------------------------------
290
  # Gradio
291
  # --------------------------------------------------------------------
 
306
  """
307
 
308
 
309
+ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component") as demo:
 
 
 
 
310
 
 
311
  with gr.Tab("CLARA"):
312
 
313
  with gr.Row(elem_id="chatbot-row"):
 
319
 
320
  with gr.Row(elem_id="input-message"):
321
  textbox = gr.Textbox(placeholder="Posez votre question", show_label=False,
322
+ scale=7, lines=1, interactive=True, elem_id="input-textbox")
 
323
 
324
  with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
325
 
326
+ with gr.Tabs() as tabs:
 
 
327
 
328
+ with gr.Tab("Sources", elem_id="tab-citations", id=1):
329
+ sources_textbox = gr.HTML(
330
+ show_label=False, elem_id="sources-textbox")
331
+ docs_textbox = gr.State("")
332
 
333
+ # ---------------------------------------------------------------------------------------
334
+ # OTHER TABS
335
+ # ---------------------------------------------------------------------------------------
 
 
336
 
337
+ with gr.Tab("Figures", elem_id="tab-images", elem_classes="max-height other-tabs"):
338
+ gallery_component = gr.Gallery()
339
+
340
+ with gr.Tab("Papers (beta)", elem_id="tab-papers", elem_classes="max-height other-tabs"):
341
 
 
 
342
  with gr.Row():
343
  with gr.Column(scale=1):
344
+ query_papers = gr.Textbox(
345
+ placeholder="Question", show_label=False, lines=1, interactive=True, elem_id="query-papers")
346
+ keywords_papers = gr.Textbox(
347
+ placeholder="Keywords", show_label=False, lines=1, interactive=True, elem_id="keywords-papers")
348
+ after = gr.Slider(minimum=1950, maximum=2023, step=1, value=1960,
349
+ label="Publication date", show_label=True, interactive=True, elem_id="date-papers")
350
+ search_papers = gr.Button(
351
+ "Search", elem_id="search-papers", interactive=True)
352
 
353
+ with gr.Column(scale=7):
354
 
355
+ with gr.Tab("Summary", elem_id="papers-summary-tab"):
356
+ papers_summary = gr.Markdown(
357
+ visible=True, elem_id="papers-summary")
 
 
 
 
 
358
 
359
+ with gr.Tab("Relevant papers", elem_id="papers-results-tab"):
360
+ papers_dataframe = gr.Dataframe(
361
+ visible=True, elem_id="papers-table", headers=papers_cols)
362
+
363
+ with gr.Tab("Citations network", elem_id="papers-network-tab"):
364
+ citations_network = gr.HTML(
365
+ visible=True, elem_id="papers-citations-network")
366
+
367
+ with gr.Tab("À propos", elem_classes="max-height other-tabs"):
368
+ with gr.Row():
369
+ with gr.Column(scale=1):
370
+ gr.Markdown(
371
+ "CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
372
+ "– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)")
373
 
374
  def start_chat(query, history):
375
  history = history + [(query, None)]
 
386
  )
387
 
388
 
389
+
390
+ def change_sample_questions(key):
391
+ index = list(QUESTIONS.keys()).index(key)
392
+ visible_bools = [False] * len(samples)
393
+ visible_bools[index] = True
394
+ return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
395
+
396
+ # dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
397
+
398
+ query_papers.submit(generate_keywords, [query_papers], [keywords_papers])
399
+ search_papers.click(find_papers, [query_papers, keywords_papers, after], [
400
+ papers_dataframe, citations_network, papers_summary])
401
+
402
  demo.queue()
403
 
404
  demo.launch(allowed_paths=["assets/download.png",
405
+ "assets/logo4.png"],
406
+ favicon_path="assets/logo4.png")
climateqa/engine/embeddings.py CHANGED
@@ -8,12 +8,8 @@ def get_embeddings_function(version = "v1.2"):
8
 
9
  # https://huggingface.co/BAAI/bge-base-en-v1.5
10
  # Best embedding model at a reasonable size at the moment (2023-11-22)
11
- # model_name = "BAAI/bge-base-en-v1.5"
12
-
13
- # https://huggingface.co/BAAI/bge-m3
14
- # A better one from 2024-04
15
- model_name = "BAAI/bge-m3"
16
-
17
  encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
18
  print("Loading embeddings model: ", model_name)
19
  embeddings_function = HuggingFaceBgeEmbeddings(
 
8
 
9
  # https://huggingface.co/BAAI/bge-base-en-v1.5
10
  # Best embedding model at a reasonable size at the moment (2023-11-22)
11
+
12
+ model_name = "BAAI/bge-base-en-v1.5"
 
 
 
 
13
  encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
14
  print("Loading embeddings model: ", model_name)
15
  embeddings_function = HuggingFaceBgeEmbeddings(
climateqa/engine/text_retriever.py CHANGED
@@ -45,4 +45,3 @@ class ClimateQARetriever(BaseRetriever):
45
  doc.metadata["page_number"] = 1
46
  results.append(doc)
47
  return results
48
-
 
45
  doc.metadata["page_number"] = 1
46
  results.append(doc)
47
  return results
 
climateqa/engine/vectorstore.py CHANGED
@@ -1,94 +1,58 @@
 
 
 
 
 
 
1
 
2
- from google.cloud import storage
3
- #storage_client = storage.Client()
4
- storage_client = storage.Client.create_anonymous_client()
5
- bucket_name = "docs-axio-clara"
 
 
6
 
7
- from langchain_pinecone import PineconeVectorStore
8
 
9
- from langchain_community.document_loaders import TextLoader
10
- from langchain_text_splitters import CharacterTextSplitter
11
- from climateqa.engine.embeddings import get_embeddings_function
12
- embeddings_function = get_embeddings_function()
13
 
 
 
14
 
15
-
16
- index_name = "my-index"
17
- namespace = "my-namespace"
 
18
 
19
 
 
 
 
 
 
20
  import os
21
  import pdfplumber
22
 
23
- def get_PDF_Names_from_GCP():
24
-
25
- listName = []
26
- # Récupération des fichier depuis GCP storage
27
- blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
28
- for blob in blobs:
29
- listName.append(blob.name)
30
- return listName
31
-
32
- def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
33
-
34
- # Récupération des fichier depuis GCP storage
35
- blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
36
- for blob in blobs:
37
-
38
- print( "\n"+blob.name+":")
39
- print( " <- Téléchargement Depuis GCP")
40
- blob.download_to_filename(pdf_folder+"/"+blob.name)
41
-
42
- # Extraction des textes dpuis les fichiers PDF
43
- print(" >>> Extraction PDF")
44
- for pdf_file in os.listdir(pdf_folder):
45
- if pdf_file.startswith("."):
46
- continue
47
- print(" > "+pdf_folder+"/"+pdf_file)
48
- pdf_total_pages = 0
49
- with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
50
- pdf_total_pages = len(pdf.pages)
51
-
52
- # Fuite mémoire pour les gros fichiers
53
- # Reouvrir le fichier à chaque N page semble rélgler le problème
54
- N_page = 300
55
- page_number = 0
56
- while page_number < pdf_total_pages:
57
-
58
- print(" -- ouverture du fichier pour "+str(N_page)+ " pages --" )
59
- with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
60
-
61
- npage = 0
62
- while (npage < N_page and page_number < pdf_total_pages) :
63
-
64
- print(" >>> "+str(page_number+1))
65
- f = open(folder_path+"/"+pdf_file+"..:page:.."+str(page_number+1), "w")
66
- for char_pdf in pdf.pages[page_number].chars:
67
- f.write(char_pdf["text"])
68
- f.close()
69
-
70
- npage = npage + 1
71
- page_number = page_number + 1
72
-
73
-
74
- print(" X removing: " + blob.name )
75
- os.remove(pdf_folder+"/"+blob.name)
76
-
77
-
78
  def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
79
 
80
- vectorstore = PineconeVectorStore(
81
- index_name=index_name,
82
- embedding=embeddings_function,
83
- #namespace=namespace
84
- )
85
-
86
- return vectorstore
87
 
88
- print(" Vectorisation ...")
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  docs = []
91
- #vector_store_from_docs = () # Créer un nouvel objet Annoy ou utiliser celui déjà initialisé selon votre code existant
92
  for filename in os.listdir(folder_path):
93
  if filename.startswith("."):
94
  continue
@@ -99,17 +63,12 @@ def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vec
99
 
100
  for doc in documents:
101
  if (doc.metadata):
102
- doc.metadata["ax_page"] = doc.metadata['source'].split("..:page:..")[-1]
103
- doc.metadata["ax_name"] = doc.metadata['source'].split("..:page:..")[0].split("/")[-1]
104
  doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
105
 
106
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
107
  docs += text_splitter.split_documents(documents)
108
- #vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
109
- vectorstore = PineconeVectorStore.from_documents(docs, embeddings_function, index_name=index_name)
110
- #vector_store_from_docs.save_local(vectors_path)
111
- return vectorstore
112
-
113
-
114
- print("MISSING VECTORS")
115
- exit(0)
 
1
+ # Pinecone
2
+ # More info at https://docs.pinecone.io/docs/langchain
3
+ # And https://python.langchain.com/docs/integrations/vectorstores/pinecone
4
+ # import os
5
+ # from pinecone import Pinecone
6
+ # from langchain_community.vectorstores import Pinecone as PineconeVectorstore
7
 
8
+ # # LOAD ENVIRONMENT VARIABLES
9
+ # try:
10
+ # from dotenv import load_dotenv
11
+ # load_dotenv()
12
+ # except:
13
+ # pass
14
 
 
15
 
16
+ # def get_pinecone_vectorstore(embeddings,text_key = "content"):
 
 
 
17
 
18
+ # pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
19
+ # index = pc.Index(os.getenv("PINECONE_API_INDEX"))
20
 
21
+ # vectorstore = PineconeVectorstore(
22
+ # index, embeddings, text_key,
23
+ # )
24
+ # return vectorstore
25
 
26
 
27
+ from langchain_community.vectorstores import Annoy
28
+ from langchain_community.document_loaders import TextLoader
29
+ from langchain_text_splitters import CharacterTextSplitter
30
+ from climateqa.engine.embeddings import get_embeddings_function
31
+ embeddings_function = get_embeddings_function()
32
  import os
33
  import pdfplumber
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
36
 
37
+ if os.path.isfile(vectors_path+"/index.annoy"):
38
+ return Annoy.load_local(vectors_path, embeddings_function,allow_dangerous_deserialization=True)
 
 
 
 
 
39
 
40
+ # Extract text from PDF files
41
+ print("Extraction PDF ...")
42
+ for pdf_file in os.listdir(pdf_folder):
43
+ if pdf_file.startswith("."):
44
+ continue
45
+ print(" > "+pdf_folder+"/"+pdf_file)
46
+ with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
47
+ for pdf_page in pdf.pages:
48
+ f = open(folder_path+"/"+pdf_file+" page "+str(pdf_page.page_number), "w")
49
+ # f.write(pdf_file+" page "+str(pdf_page.page_number))
50
+ for char_pdf in pdf_page.chars:
51
+ f.write(char_pdf["text"])
52
+ f.close()
53
 
54
  docs = []
55
+ vector_store_from_docs = () # Créer un nouvel objet Annoy ou utiliser celui déjà initialisé selon votre code existant
56
  for filename in os.listdir(folder_path):
57
  if filename.startswith("."):
58
  continue
 
63
 
64
  for doc in documents:
65
  if (doc.metadata):
66
+ doc.metadata["ax_page"] = doc.metadata['source'].split(" ")[-1]
67
+ doc.metadata["ax_name"] = doc.metadata['source'].split(" ")[0].split("/")[-1]
68
  doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
69
 
70
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
71
  docs += text_splitter.split_documents(documents)
72
+ vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
73
+ vector_store_from_docs.save_local(vectors_path)
74
+ return vector_store_from_docs
 
 
 
 
 
logs/1715672103.255797.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"user_id": "245d9442-2651-4578-8f63-4ed4145c0a40", "prompt": "quels risques physiques en 2024 ?", "query": "quels risques physiques en 2024 ?", "question": "What are the projected physical risks in 2024 related to climate change?", "sources": ["Custom"], "docs": [], "answer": "Je suis d\u00e9sol\u00e9, mais les extraits fournis ne contiennent pas d'informations sur les risques physiques projet\u00e9s en 2024 li\u00e9s au changement climatique. Je n'ai pas suffisamment d'informations pour r\u00e9pondre \u00e0 cette question.", "time": "1715672103.255797"}
logs/1715673060.457813.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"user_id": "c936b5ac-affe-4df7-9f9c-6f78ae83525e", "prompt": "quels risques physiques ?", "query": "quels risques physiques ?", "question": "What are the physical risks associated with climate change?", "sources": ["Custom"], "docs": [], "answer": "Je suis d\u00e9sol\u00e9, mais les extraits fournis ne contiennent pas d'informations sur les risques physiques associ\u00e9s au changement climatique. Je n'ai pas assez d'informations pour r\u00e9pondre \u00e0 cette question.", "time": "1715673060.457813"}
logs/1715673178.788617.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"user_id": "80fd8b5f-4cf2-430c-8c14-09b4acc7436c", "prompt": "quels risques physiques ?", "query": "quels risques physiques ?", "question": "What are the physical risks associated with climate change?", "sources": ["Custom"], "docs": [], "answer": "Je suis d\u00e9sol\u00e9e, mais les extraits fournis ne contiennent pas d'informations sur les risques physiques associ\u00e9s au changement climatique. Je n'ai pas suffisamment d'informations pour r\u00e9pondre \u00e0 cette question.", "time": "1715673178.788617"}
logs/1715675920.752972.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"user_id": "d06c2c8a-ab31-4dcd-8d5c-69a4a678221b", "prompt": "quels sont les risques physiques en 2024 ?", "query": "quels sont les risques physiques en 2024 ?", "question": "What are the projected physical risks in 2024?", "sources": ["Custom"], "docs": [], "answer": "Je suis d\u00e9sol\u00e9e, mais les documents fournis ne contiennent pas d'informations sur les risques physiques projet\u00e9s en 2024. Je n'ai pas assez d'informations pour r\u00e9pondre \u00e0 cette question.", "time": "1715675920.752972"}
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- google-cloud-storage==2.16.0
2
  gradio==4.19.1
 
3
  python-dotenv==1.0.0
4
  langchain==0.1.10
5
  langchain_openai==0.0.6
@@ -11,5 +11,4 @@ pyalex==0.13
11
  networkx==3.2.1
12
  pyvis==0.3.2
13
  annoy==1.17.3
14
- langchain_pinecone
15
  pdfplumber
 
 
1
  gradio==4.19.1
2
+ gunicorn==22.0.0
3
  python-dotenv==1.0.0
4
  langchain==0.1.10
5
  langchain_openai==0.0.6
 
11
  networkx==3.2.1
12
  pyvis==0.3.2
13
  annoy==1.17.3
 
14
  pdfplumber
style.css CHANGED
@@ -3,78 +3,6 @@
3
  --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
4
  } */
5
 
6
- .logo-axio {
7
- float: right;
8
- position: absolute;
9
- right: 0px;
10
- }
11
-
12
-
13
- /* couleur text */
14
- p {
15
- color: black !important;
16
- }
17
- li {
18
- color: black !important;
19
- }
20
-
21
- button.selected {
22
- border-radius: 20px !important;
23
- }
24
- button:hover {
25
- color: #ffc000 !important;
26
- }
27
-
28
-
29
- /* fond panels/blocks */
30
- .panel {
31
- background-color: #eeeeee !important;
32
- border: 0px;
33
- }
34
- .block {
35
- background-color: #eeeeee !important;
36
- }
37
-
38
- /* fond bot */
39
- .bot {
40
- background-color: #eeeeee !important;
41
- }
42
-
43
- /* avatar en debut de reponse */
44
- .avatar-container {
45
- align-self: baseline !important;
46
- margin-top: 35px;
47
- }
48
-
49
-
50
-
51
- /* fond user */
52
- .user {
53
- background-color: #d2d2d2 !important;
54
- }
55
- textarea {
56
- background-color: #d2d2d2 !important;
57
- color: black !important;
58
- }
59
-
60
-
61
- /* fond app */
62
- gradio-app {
63
- background-color: #ffffff !important;
64
- }
65
- .gradio-container {
66
- background-color: #ffffff !important;
67
- max-width: 100% !important;
68
- width: 100% !important;
69
- }
70
-
71
-
72
- .a-propos {
73
- margin: 20px !important;
74
- }
75
-
76
-
77
-
78
  .telecharger {
79
  border: 1px solid;
80
  padding: 5px;
@@ -115,7 +43,7 @@ body.dark .warning-box * {
115
 
116
 
117
  body.dark .tip-box * {
118
- color:rgb(216, 216, 216) !important;
119
  }
120
 
121
 
 
3
  --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
4
  } */
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  .telecharger {
7
  border: 1px solid;
8
  padding: 5px;
 
43
 
44
 
45
  body.dark .tip-box * {
46
+ color:black !important;
47
  }
48
 
49
 
test CHANGED
@@ -19,7 +19,8 @@ ENV HOME=/home/user \
19
  GRADIO_NUM_PORTS=1 \
20
  GRADIO_SERVER_NAME=0.0.0.0 \
21
  GRADIO_THEME=huggingface \
22
- SYSTEM=spaces
 
23
 
24
  # Set the working directory to the user's home directory
25
  WORKDIR $HOME/app
@@ -27,6 +28,8 @@ WORKDIR $HOME/app
27
  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
28
  COPY --chown=user . $HOME/app
29
 
30
- CMD ["python","setup.py"]
31
 
32
- CMD ["python", "app.py"]
 
 
 
19
  GRADIO_NUM_PORTS=1 \
20
  GRADIO_SERVER_NAME=0.0.0.0 \
21
  GRADIO_THEME=huggingface \
22
+ SYSTEM=spaces \
23
+ PORT=7860
24
 
25
  # Set the working directory to the user's home directory
26
  WORKDIR $HOME/app
 
28
  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
29
  COPY --chown=user . $HOME/app
30
 
31
+ #CMD ["python","setup.py"]
32
 
33
+ #CMD ["python", "app.py"]
34
+
35
+ CMD gunicorn -b 0.0.0.0:$PORT app:demo
vectors/index.annoy CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b2b8ad228674a015ce9f4c7a4969e0267400140d450773f86802f50ac27ec45
3
  size 2238984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b94e9d486dbe3a9e2397672bda1d1c17198cca42a53afaa16ef8ecfcebd22fc9
3
  size 2238984
vectors/index.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5db10ad74c42238caac4b738e24f14f80b9688353f721397c3db34eb642ca3e2
3
  size 3223915
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4eb3d63539603642200f07f8fac2e290e94104fbbe4f4471dc663eff850263f6
3
  size 3223915