Samiraxio committed on
Commit
ffe502f
1 Parent(s): 57a98f9

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -37,3 +37,4 @@ PDF/Anticiper-les-effets-de-l-adaptation-dun-rechauffement-climatique-de-plus-4-
37
  PDF/deu-2023.pdf filter=lfs diff=lfs merge=lfs -text
38
  PDF/memo_risques_physiques_focus_batiment_2022.pdf filter=lfs diff=lfs merge=lfs -text
39
  vectors/index.annoy filter=lfs diff=lfs merge=lfs -text
 
 
37
  PDF/deu-2023.pdf filter=lfs diff=lfs merge=lfs -text
38
  PDF/memo_risques_physiques_focus_batiment_2022.pdf filter=lfs diff=lfs merge=lfs -text
39
  vectors/index.annoy filter=lfs diff=lfs merge=lfs -text
40
+ sources/Anticiper-les-effets-de-l-adaptation-dun-rechauffement-climatique-de-plus-4-degres-quels-couts-de-l-adaptation.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -9,6 +9,9 @@ setAPIKEY.sh
9
  .AppleDouble
10
  .LSOverride
11
 
 
 
 
12
  # Icon must end with two \r
13
  Icon
14
 
 
9
  .AppleDouble
10
  .LSOverride
11
 
12
+ # Chatbot conversation history
13
+ *.json
14
+
15
  # Icon must end with two \r
16
  Icon
17
 
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Clara
3
  app_file: app.py
4
  sdk: gradio
5
  sdk_version: 4.19.1
 
1
  ---
2
+ title: clara
3
  app_file: app.py
4
  sdk: gradio
5
  sdk_version: 4.19.1
app.py CHANGED
@@ -1,8 +1,7 @@
 
 
1
  # , get_pinecone_vectorstore, find_similar_vectors
2
- from climateqa.engine.vectorstore import build_vectores_stores
3
- from climateqa.engine.rag import make_rag_papers_chain
4
- from climateqa.engine.keywords import make_keywords_chain
5
- from climateqa.sample_questions import QUESTIONS
6
  from climateqa.engine.text_retriever import ClimateQARetriever
7
  from climateqa.engine.rag import make_rag_chain
8
  from climateqa.engine.llm import get_llm
@@ -11,11 +10,9 @@ from datetime import datetime
11
  import json
12
  import re
13
  import gradio as gr
14
- from climateqa.papers.openalex import OpenAlex
15
  from sentence_transformers import CrossEncoder
16
 
17
  reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
18
- oa = OpenAlex()
19
 
20
  # Load environment variables in local mode
21
  try:
@@ -25,9 +22,9 @@ except Exception as e:
25
  pass
26
 
27
  # Set up Gradio Theme
28
- theme = gr.themes.Base(
29
- primary_hue="blue",
30
- secondary_hue="red",
31
  font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
32
  "system-ui", "sans-serif"],
33
  )
@@ -163,7 +160,7 @@ async def chat(query, history):
163
  "answer": history[-1][1],
164
  "time": timestamp,
165
  }
166
- log_locally(log_file, logs)
167
 
168
  yield history, docs_html, output_query, output_language, gallery, output_query, output_keywords
169
 
@@ -181,7 +178,7 @@ def make_html_source(source, i):
181
  <div class="card-content">
182
  <div>
183
  <div style="float:right;width 10%;position:relative;top:0px">
184
- <a href='{meta['ax_url']}'><img style="width:20px" src='/file/assets/download.png' /></a>
185
  </div>
186
  <div>
187
  <h2>Extrait {i}</h2>
@@ -191,9 +188,9 @@ def make_html_source(source, i):
191
  <p>{text_content}</p>
192
 
193
  </div>
194
- <div class="card-footer">
195
  <span>{name}</span>
196
- </div>
197
  </div>
198
  """
199
 
@@ -209,79 +206,6 @@ def log_locally(file, logs):
209
  f.write(logs_json)
210
 
211
 
212
- def generate_keywords(query):
213
- chain = make_keywords_chain(llm)
214
- keywords = chain.invoke(query)
215
- keywords = " AND ".join(keywords["keywords"])
216
- return keywords
217
-
218
-
219
- papers_cols_widths = {
220
- "doc": 50,
221
- "id": 100,
222
- "title": 300,
223
- "doi": 100,
224
- "publication_year": 100,
225
- "abstract": 500,
226
- "rerank_score": 100,
227
- "is_oa": 50,
228
- }
229
-
230
- papers_cols = list(papers_cols_widths.keys())
231
- papers_cols_widths = list(papers_cols_widths.values())
232
-
233
-
234
- async def find_papers(query, keywords, after):
235
-
236
- summary = ""
237
-
238
- df_works = oa.search(keywords, after=after)
239
- df_works = df_works.dropna(subset=["abstract"])
240
- df_works = oa.rerank(query, df_works, reranker)
241
- df_works = df_works.sort_values("rerank_score", ascending=False)
242
- G = oa.make_network(df_works)
243
-
244
- height = "750px"
245
- network = oa.show_network(
246
- G, color_by="rerank_score", notebook=False, height=height)
247
- network_html = network.generate_html()
248
-
249
- network_html = network_html.replace("'", "\"")
250
- css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
251
- network_html = network_html + css_to_inject
252
-
253
- network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
254
- display-capture; encrypted-media;" sandbox="allow-modals allow-forms
255
- allow-scripts allow-same-origin allow-popups
256
- allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
257
- allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
258
-
259
- docs = df_works["content"].head(15).tolist()
260
-
261
- df_works = df_works.reset_index(
262
- drop=True).reset_index().rename(columns={"index": "doc"})
263
- df_works["doc"] = df_works["doc"] + 1
264
- df_works = df_works[papers_cols]
265
-
266
- yield df_works, network_html, summary
267
-
268
- chain = make_rag_papers_chain(llm)
269
- result = chain.astream_log(
270
- {"question": query, "docs": docs, "language": "English"})
271
- path_answer = "/logs/StrOutputParser/streamed_output/-"
272
-
273
- async for op in result:
274
-
275
- op = op.ops[0]
276
-
277
- if op['path'] == path_answer: # reforulated question
278
- new_token = op['value'] # str
279
- summary += new_token
280
- else:
281
- continue
282
- yield df_works, network_html, summary
283
-
284
-
285
  # --------------------------------------------------------------------
286
  # Gradio
287
  # --------------------------------------------------------------------
@@ -302,8 +226,13 @@ What would you like to know today?
302
  """
303
 
304
 
305
- with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component") as demo:
306
 
 
 
 
 
 
307
  with gr.Tab("CLARA"):
308
 
309
  with gr.Row(elem_id="chatbot-row"):
@@ -315,57 +244,44 @@ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-compon
315
 
316
  with gr.Row(elem_id="input-message"):
317
  textbox = gr.Textbox(placeholder="Posez votre question", show_label=False,
318
- scale=7, lines=1, interactive=True, elem_id="input-textbox")
319
 
320
- with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
321
 
322
- with gr.Tabs() as tabs:
323
 
324
- with gr.Tab("Sources", elem_id="tab-citations", id=1):
325
- sources_textbox = gr.HTML(
326
- show_label=False, elem_id="sources-textbox")
327
- docs_textbox = gr.State("")
328
 
329
- # ---------------------------------------------------------------------------------------
330
- # OTHER TABS
331
- # ---------------------------------------------------------------------------------------
332
 
333
- with gr.Tab("Figures", elem_id="tab-images", elem_classes="max-height other-tabs"):
334
- gallery_component = gr.Gallery()
 
 
 
335
 
336
- with gr.Tab("Papers (beta)", elem_id="tab-papers", elem_classes="max-height other-tabs"):
337
 
 
 
338
  with gr.Row():
339
  with gr.Column(scale=1):
340
- query_papers = gr.Textbox(
341
- placeholder="Question", show_label=False, lines=1, interactive=True, elem_id="query-papers")
342
- keywords_papers = gr.Textbox(
343
- placeholder="Keywords", show_label=False, lines=1, interactive=True, elem_id="keywords-papers")
344
- after = gr.Slider(minimum=1950, maximum=2023, step=1, value=1960,
345
- label="Publication date", show_label=True, interactive=True, elem_id="date-papers")
346
- search_papers = gr.Button(
347
- "Search", elem_id="search-papers", interactive=True)
348
-
349
- with gr.Column(scale=7):
350
-
351
- with gr.Tab("Summary", elem_id="papers-summary-tab"):
352
- papers_summary = gr.Markdown(
353
- visible=True, elem_id="papers-summary")
354
 
355
- with gr.Tab("Relevant papers", elem_id="papers-results-tab"):
356
- papers_dataframe = gr.Dataframe(
357
- visible=True, elem_id="papers-table", headers=papers_cols)
358
 
359
- with gr.Tab("Citations network", elem_id="papers-network-tab"):
360
- citations_network = gr.HTML(
361
- visible=True, elem_id="papers-citations-network")
 
 
 
 
 
362
 
363
- with gr.Tab("À propos", elem_classes="max-height other-tabs"):
364
- with gr.Row():
365
- with gr.Column(scale=1):
366
- gr.Markdown(
367
- "CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
368
- "– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)")
369
 
370
  def start_chat(query, history):
371
  history = history + [(query, None)]
@@ -382,21 +298,8 @@ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-compon
382
  )
383
 
384
 
385
-
386
- def change_sample_questions(key):
387
- index = list(QUESTIONS.keys()).index(key)
388
- visible_bools = [False] * len(samples)
389
- visible_bools[index] = True
390
- return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
391
-
392
- # dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
393
-
394
- query_papers.submit(generate_keywords, [query_papers], [keywords_papers])
395
- search_papers.click(find_papers, [query_papers, keywords_papers, after], [
396
- papers_dataframe, citations_network, papers_summary])
397
-
398
  demo.queue()
399
 
400
  demo.launch(allowed_paths=["assets/download.png",
401
- "assets/logo4.png"],
402
- favicon_path="assets/logo4.png")
 
1
+
2
+
3
  # , get_pinecone_vectorstore, find_similar_vectors
4
+ from climateqa.engine.vectorstore import build_vectores_stores, get_PDF_Names_from_GCP
 
 
 
5
  from climateqa.engine.text_retriever import ClimateQARetriever
6
  from climateqa.engine.rag import make_rag_chain
7
  from climateqa.engine.llm import get_llm
 
10
  import json
11
  import re
12
  import gradio as gr
 
13
  from sentence_transformers import CrossEncoder
14
 
15
  reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
 
16
 
17
  # Load environment variables in local mode
18
  try:
 
22
  pass
23
 
24
  # Set up Gradio Theme
25
+ theme = gr.themes.Soft(
26
+ primary_hue="yellow",
27
+ secondary_hue="orange",
28
  font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
29
  "system-ui", "sans-serif"],
30
  )
 
160
  "answer": history[-1][1],
161
  "time": timestamp,
162
  }
163
+ #log_locally(log_file, logs)
164
 
165
  yield history, docs_html, output_query, output_language, gallery, output_query, output_keywords
166
 
 
178
  <div class="card-content">
179
  <div>
180
  <div style="float:right;width 10%;position:relative;top:0px">
181
+ <a href='{meta['ax_url']}' target='_blank'><img style="width:20px" src='/file/assets/download.png' /></a>
182
  </div>
183
  <div>
184
  <h2>Extrait {i}</h2>
 
188
  <p>{text_content}</p>
189
 
190
  </div>
191
+ <!-- <div class="card-footer">
192
  <span>{name}</span>
193
+ </div> -->
194
  </div>
195
  """
196
 
 
206
  f.write(logs_json)
207
 
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  # --------------------------------------------------------------------
210
  # Gradio
211
  # --------------------------------------------------------------------
 
226
  """
227
 
228
 
229
+ with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component", elem_classes="ax_background") as demo:
230
 
231
+ gr.HTML("""
232
+ <img style="width:100px" src="file/assets/axionable.svg"/>
233
+ """, elem_classes="logo-axio ")
234
+
235
+ # TAB Clara
236
  with gr.Tab("CLARA"):
237
 
238
  with gr.Row(elem_id="chatbot-row"):
 
244
 
245
  with gr.Row(elem_id="input-message"):
246
  textbox = gr.Textbox(placeholder="Posez votre question", show_label=False,
247
+ scale=7, lines=1, interactive=True, elem_id="input-textbox")
248
 
 
249
 
250
+ with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
251
 
252
+ with gr.Column(scale=1, elem_id="tab-citations"):
253
+
254
+ gr.HTML("<p>Sources</p>")
 
255
 
256
+ sources_textbox = gr.HTML(
257
+ show_label=False, elem_id="sources-textbox")
258
+ docs_textbox = gr.State("")
259
 
260
+ # The tabs object is currently required.
261
+ # It seems to be used to freeze the contents of the tabs
262
+ # while the AI is generating a response.
263
+ with gr.Tabs() as tabs:
264
+ None
265
 
 
266
 
267
+ # TAB A propos
268
+ with gr.Tab("À propos", elem_classes="max-height other-tabs"):
269
  with gr.Row():
270
  with gr.Column(scale=1):
271
+ gr.Markdown(
272
+ ("CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
273
+ "– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)"), elem_classes="a-propos")
 
 
 
 
 
 
 
 
 
 
 
274
 
 
 
 
275
 
276
+ # # TAB Configuration
277
+ # with gr.Tab("Configuration"):
278
+ #
279
+ # with gr.Row(elem_id="config-row"):
280
+ # with gr.Column(scale=1):
281
+ #
282
+ # for pdfName in get_PDF_Names_from_GCP():
283
+ # gr.Markdown( pdfName, elem_classes="a-propos")
284
 
 
 
 
 
 
 
285
 
286
  def start_chat(query, history):
287
  history = history + [(query, None)]
 
298
  )
299
 
300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  demo.queue()
302
 
303
  demo.launch(allowed_paths=["assets/download.png",
304
+ "assets/logo4.png",
305
+ "assets/axionable.svg"],favicon_path="assets/logo4.png")
assets/axionable.svg ADDED
climateqa/engine/embeddings.py CHANGED
@@ -8,8 +8,12 @@ def get_embeddings_function(version = "v1.2"):
8
 
9
  # https://huggingface.co/BAAI/bge-base-en-v1.5
10
  # Best embedding model at a reasonable size at the moment (2023-11-22)
11
-
12
- model_name = "BAAI/bge-base-en-v1.5"
 
 
 
 
13
  encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
14
  print("Loading embeddings model: ", model_name)
15
  embeddings_function = HuggingFaceBgeEmbeddings(
 
8
 
9
  # https://huggingface.co/BAAI/bge-base-en-v1.5
10
  # Best embedding model at a reasonable size at the moment (2023-11-22)
11
+ # model_name = "BAAI/bge-base-en-v1.5"
12
+
13
+ # https://huggingface.co/BAAI/bge-m3
14
+ # A better one from 2024-04
15
+ model_name = "BAAI/bge-m3"
16
+
17
  encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
18
  print("Loading embeddings model: ", model_name)
19
  embeddings_function = HuggingFaceBgeEmbeddings(
climateqa/engine/vectorstore.py CHANGED
@@ -1,98 +1,94 @@
1
- # Pinecone
2
- # More info at https://docs.pinecone.io/docs/langchain
3
- # And https://python.langchain.com/docs/integrations/vectorstores/pinecone
4
- #import os
5
- #from pinecone import Pinecone
6
- #from langchain_community.vectorstores import Pinecone as PineconeVectorstore
7
 
8
- # LOAD ENVIRONMENT VARIABLES
9
- #try:
10
- # from dotenv import load_dotenv
11
- # load_dotenv()
12
- #except:
13
- # pass
14
 
 
15
 
16
- #def get_pinecone_vectorstore(embeddings,text_key = "content"):
 
 
 
17
 
18
- # # initialize pinecone
19
- # pinecone.init(
20
- # api_key=os.getenv("PINECONE_API_KEY"), # find at app.pinecone.io
21
- # environment=os.getenv("PINECONE_API_ENVIRONMENT"), # next to api key in console
22
- # )
23
 
24
- # index_name = os.getenv("PINECONE_API_INDEX")
25
- # vectorstore = Pinecone.from_existing_index(index_name, embeddings,text_key = text_key)
26
 
27
- # return vectorstore
 
28
 
29
- # pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
30
- # index = pc.Index(os.getenv("PINECONE_API_INDEX"))
31
 
32
- # vectorstore = PineconeVectorstore(
33
- # index, embeddings, text_key,
34
- # )
35
- # return vectorstore
36
 
 
37
 
 
 
 
 
 
 
38
 
39
- # def get_pinecone_retriever(vectorstore,k = 10,namespace = "vectors",sources = ["IPBES","IPCC"]):
40
 
41
- # assert isinstance(sources,list)
 
 
42
 
43
- # # Check if all elements in the list are either IPCC or IPBES
44
- # filter = {
45
- # "source": { "$in":sources},
46
- # }
47
 
48
- # retriever = vectorstore.as_retriever(search_kwargs={
49
- # "k": k,
50
- # "namespace":"vectors",
51
- # "filter":filter
52
- # })
 
 
 
 
 
 
 
 
 
 
53
 
54
- # return retriever
 
55
 
56
- from langchain_community.vectorstores import Annoy
57
- from langchain_community.document_loaders import TextLoader
58
- from langchain_text_splitters import CharacterTextSplitter
59
- from climateqa.engine.embeddings import get_embeddings_function
60
- embeddings_function = get_embeddings_function()
61
 
62
- #def build_vectores_stores(content_path):
63
- # loader = TextLoader(content_path)
64
- # documents = loader.load()
65
- # text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
66
- # docs = text_splitter.split_documents(documents)
67
 
68
- # vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
69
- # return vector_store_from_docs
70
 
71
 
72
- import os
73
- import pdfplumber
 
74
 
75
  def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
76
-
77
- if os.path.isfile(vectors_path+"/index.annoy"):
78
- return Annoy.load_local(vectors_path, embeddings_function,allow_dangerous_deserialization=True)
79
-
80
- # Extract text from PDF files
81
- print("Extraction PDF ...")
82
- for pdf_file in os.listdir(pdf_folder):
83
- if pdf_file.startswith("."):
84
- continue
85
- print(" > "+pdf_folder+"/"+pdf_file)
86
- with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
87
- for pdf_page in pdf.pages:
88
- f = open(folder_path+"/"+pdf_file+" page "+str(pdf_page.page_number), "w")
89
- # f.write(pdf_file+" page "+str(pdf_page.page_number))
90
- for char_pdf in pdf_page.chars:
91
- f.write(char_pdf["text"])
92
- f.close()
93
 
94
  docs = []
95
- vector_store_from_docs = () # Créer un nouvel objet Annoy ou utiliser celui déjà initialisé selon votre code existant
96
  for filename in os.listdir(folder_path):
97
  if filename.startswith("."):
98
  continue
@@ -103,12 +99,17 @@ def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vec
103
 
104
  for doc in documents:
105
  if (doc.metadata):
106
- doc.metadata["ax_page"] = doc.metadata['source'].split(" ")[-1]
107
- doc.metadata["ax_name"] = doc.metadata['source'].split(" ")[0].split("/")[-1]
108
  doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
109
 
110
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
111
  docs += text_splitter.split_documents(documents)
112
- vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
113
- vector_store_from_docs.save_local(vectors_path)
114
- return vector_store_from_docs
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+ from google.cloud import storage
3
+ #storage_client = storage.Client()
4
+ storage_client = storage.Client.create_anonymous_client()
5
+ bucket_name = "docs-axio-clara"
 
 
6
 
7
+ from langchain_pinecone import PineconeVectorStore
8
 
9
+ from langchain_community.document_loaders import TextLoader
10
+ from langchain_text_splitters import CharacterTextSplitter
11
+ from climateqa.engine.embeddings import get_embeddings_function
12
+ embeddings_function = get_embeddings_function()
13
 
 
 
 
 
 
14
 
 
 
15
 
16
+ index_name = "my-index"
17
+ namespace = "my-namespace"
18
 
 
 
19
 
20
+ import os
21
+ import pdfplumber
 
 
22
 
23
+ def get_PDF_Names_from_GCP():
24
 
25
+ listName = []
26
+ # Fetch the files from GCP storage
27
+ blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
28
+ for blob in blobs:
29
+ listName.append(blob.name)
30
+ return listName
31
 
32
+ def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
33
 
34
+ # Fetch the files from GCP storage
35
+ blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
36
+ for blob in blobs:
37
 
38
+ print( "\n"+blob.name+":")
39
+ print( " <- Téléchargement Depuis GCP")
40
+ blob.download_to_filename(pdf_folder+"/"+blob.name)
 
41
 
42
+ # Extract the text from the PDF files
43
+ print(" >>> Extraction PDF")
44
+ for pdf_file in os.listdir(pdf_folder):
45
+ if pdf_file.startswith("."):
46
+ continue
47
+ print(" > "+pdf_folder+"/"+pdf_file)
48
+ pdf_total_pages = 0
49
+ with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
50
+ pdf_total_pages = len(pdf.pages)
51
+
52
+ # Memory leak with large files
53
+ # Reopening the file every N pages seems to fix the problem
54
+ N_page = 300
55
+ page_number = 0
56
+ while page_number < pdf_total_pages:
57
 
58
+ print(" -- ouverture du fichier pour "+str(N_page)+ " pages --" )
59
+ with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
60
 
61
+ npage = 0
62
+ while (npage < N_page and page_number < pdf_total_pages) :
 
 
 
63
 
64
+ print(" >>> "+str(page_number+1))
65
+ f = open(folder_path+"/"+pdf_file+"..:page:.."+str(page_number+1), "w")
66
+ for char_pdf in pdf.pages[page_number].chars:
67
+ f.write(char_pdf["text"])
68
+ f.close()
69
 
70
+ npage = npage + 1
71
+ page_number = page_number + 1
72
 
73
 
74
+ print(" X removing: " + blob.name )
75
+ os.remove(pdf_folder+"/"+blob.name)
76
+
77
 
78
  def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path = "./vectors"):
79
+
80
+ vectorstore = PineconeVectorStore(
81
+ index_name=index_name,
82
+ embedding=embeddings_function,
83
+ #namespace=namespace
84
+ )
85
+
86
+ return vectorstore
87
+
88
+ print(" Vectorisation ...")
 
 
 
 
 
 
 
89
 
90
  docs = []
91
+ #vector_store_from_docs = () # Créer un nouvel objet Annoy ou utiliser celui déjà initialisé selon votre code existant
92
  for filename in os.listdir(folder_path):
93
  if filename.startswith("."):
94
  continue
 
99
 
100
  for doc in documents:
101
  if (doc.metadata):
102
+ doc.metadata["ax_page"] = doc.metadata['source'].split("..:page:..")[-1]
103
+ doc.metadata["ax_name"] = doc.metadata['source'].split("..:page:..")[0].split("/")[-1]
104
  doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
105
 
106
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
107
  docs += text_splitter.split_documents(documents)
108
+ #vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
109
+ vectorstore = PineconeVectorStore.from_documents(docs, embeddings_function, index_name=index_name)
110
+ #vector_store_from_docs.save_local(vectors_path)
111
+ return vectorstore
112
+
113
+
114
+ print("MISSING VECTORS")
115
+ exit(0)
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 
1
  gradio==4.19.1
2
- gunicorn==22.0.0
3
  python-dotenv==1.0.0
4
  langchain==0.1.10
5
  langchain_openai==0.0.6
@@ -10,5 +10,4 @@ msal
10
  pyalex==0.13
11
  networkx==3.2.1
12
  pyvis==0.3.2
13
- annoy==1.17.3
14
- pdfplumber
 
1
+ google-cloud-storage==2.16.0
2
  gradio==4.19.1
 
3
  python-dotenv==1.0.0
4
  langchain==0.1.10
5
  langchain_openai==0.0.6
 
10
  pyalex==0.13
11
  networkx==3.2.1
12
  pyvis==0.3.2
13
+ annoy==1.17.3
 
sources/Anticiper-les-effets-de-l-adaptation-dun-rechauffement-climatique-de-plus-4-degres-quels-couts-de-l-adaptation.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be9d2d29a6545fc1949b10eb8428e6fac632aa84020fa61f4f76600817a21cd5
3
+ size 2079496
style.css CHANGED
@@ -3,6 +3,78 @@
3
  --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
4
  } */
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  .telecharger {
7
  border: 1px solid;
8
  padding: 5px;
@@ -43,7 +115,7 @@ body.dark .warning-box * {
43
 
44
 
45
  body.dark .tip-box * {
46
- color:black !important;
47
  }
48
 
49
 
 
3
  --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
4
  } */
5
 
6
+ .logo-axio {
7
+ float: right;
8
+ position: absolute;
9
+ right: 0px;
10
+ }
11
+
12
+
13
+ /* couleur text */
14
+ p {
15
+ color: black !important;
16
+ }
17
+ li {
18
+ color: black !important;
19
+ }
20
+
21
+ button.selected {
22
+ border-radius: 20px !important;
23
+ }
24
+ button:hover {
25
+ color: #ffc000 !important;
26
+ }
27
+
28
+
29
+ /* fond panels/blocks */
30
+ .panel {
31
+ background-color: #eeeeee !important;
32
+ border: 0px;
33
+ }
34
+ .block {
35
+ background-color: #eeeeee !important;
36
+ }
37
+
38
+ /* fond bot */
39
+ .bot {
40
+ background-color: #eeeeee !important;
41
+ }
42
+
43
+ /* avatar en debut de reponse */
44
+ .avatar-container {
45
+ align-self: baseline !important;
46
+ margin-top: 35px;
47
+ }
48
+
49
+
50
+
51
+ /* fond user */
52
+ .user {
53
+ background-color: #d2d2d2 !important;
54
+ }
55
+ textarea {
56
+ background-color: #d2d2d2 !important;
57
+ color: black !important;
58
+ }
59
+
60
+
61
+ /* fond app */
62
+ gradio-app {
63
+ background-color: #ffffff !important;
64
+ }
65
+ .gradio-container {
66
+ background-color: #ffffff !important;
67
+ max-width: 100% !important;
68
+ width: 100% !important;
69
+ }
70
+
71
+
72
+ .a-propos {
73
+ margin: 20px !important;
74
+ }
75
+
76
+
77
+
78
  .telecharger {
79
  border: 1px solid;
80
  padding: 5px;
 
115
 
116
 
117
  body.dark .tip-box * {
118
+ color:rgb(216, 216, 216) !important;
119
  }
120
 
121
 
test CHANGED
@@ -19,8 +19,7 @@ ENV HOME=/home/user \
19
  GRADIO_NUM_PORTS=1 \
20
  GRADIO_SERVER_NAME=0.0.0.0 \
21
  GRADIO_THEME=huggingface \
22
- SYSTEM=spaces \
23
- PORT=7860
24
 
25
  # Set the working directory to the user's home directory
26
  WORKDIR $HOME/app
@@ -28,8 +27,6 @@ WORKDIR $HOME/app
28
  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
29
  COPY --chown=user . $HOME/app
30
 
31
- #CMD ["python","setup.py"]
32
 
33
- #CMD ["python", "app.py"]
34
-
35
- CMD gunicorn -b 0.0.0.0:$PORT app:demo
 
19
  GRADIO_NUM_PORTS=1 \
20
  GRADIO_SERVER_NAME=0.0.0.0 \
21
  GRADIO_THEME=huggingface \
22
+ SYSTEM=spaces
 
23
 
24
  # Set the working directory to the user's home directory
25
  WORKDIR $HOME/app
 
27
  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
28
  COPY --chown=user . $HOME/app
29
 
30
+ CMD ["python","setup.py"]
31
 
32
+ CMD ["python", "app.py"]