LOUIS SANNA committed on
Commit
35c9187
1 Parent(s): 6d2199d

feat(logging)

Files changed (4)
  1. .vscode/settings.json +3 -0
  2. app.py +202 -358
  3. climateqa/logging.py +70 -0
  4. climateqa/vectorstore.py +0 -18
.vscode/settings.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "python.pythonPath": "/Users/louissanna/opt/anaconda3/envs/anything-question-answering/bin/python"
+ }
app.py CHANGED
@@ -1,21 +1,16 @@
  import gradio as gr
- import pandas as pd
- import numpy as np
- import os
- from datetime import datetime
 
  from utils import create_user_id
 
- from azure.storage.fileshare import ShareServiceClient
 
  # Langchain
  from langchain.embeddings import HuggingFaceEmbeddings
- from langchain.schema import AIMessage, HumanMessage
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 
  # ClimateQ&A imports
  from climateqa.llm import get_llm
- from climateqa.chains import load_qa_chain_with_docs,load_qa_chain_with_text
+ from climateqa.logging import log
+ from climateqa.chains import load_qa_chain_with_text
  from climateqa.chains import load_reformulation_chain
  from climateqa.vectorstore import get_pinecone_vectorstore
  from climateqa.retriever import ClimateQARetriever
@@ -24,6 +19,7 @@ from climateqa.prompts import audience_prompts
  # Load environment variables in local mode
  try:
      from dotenv import load_dotenv
+
      load_dotenv()
  except Exception as e:
      pass
@@ -36,7 +32,6 @@ theme = gr.themes.Base(
  )
 
 
-
  init_prompt = ""
 
  system_template = {
@@ -44,47 +39,40 @@ system_template = {
      "content": init_prompt,
  }
 
- account_key = os.environ["BLOB_ACCOUNT_KEY"]
- if len(account_key) == 86:
-     account_key += "=="
-
- credential = {
-     "account_key": account_key,
-     "account_name": os.environ["BLOB_ACCOUNT_NAME"],
- }
-
- account_url = os.environ["BLOB_ACCOUNT_URL"]
- file_share_name = "climategpt"
- service = ShareServiceClient(account_url=account_url, credential=credential)
- share_client = service.get_share_client(file_share_name)
-
  user_id = create_user_id()
 
- #---------------------------------------------------------------------------
+ # ---------------------------------------------------------------------------
  # ClimateQ&A core functions
- #---------------------------------------------------------------------------
+ # ---------------------------------------------------------------------------
 
  from langchain.callbacks.base import BaseCallbackHandler
  from queue import Queue, Empty
  from threading import Thread
  from collections.abc import Generator
  from langchain.schema import LLMResult
- from typing import Any, Union,Dict,List
+ from typing import Any, Union, Dict, List
  from queue import SimpleQueue
+
  # # Create a Queue
  # Q = Queue()
 
  import re
 
+
  def parse_output_llm_with_sources(output):
      # Split the content into a list of text and "[Doc X]" references
-     content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
+     content_parts = re.split(r"\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]", output)
      parts = []
      for part in content_parts:
          if part.startswith("Doc"):
              subparts = part.split(",")
-             subparts = [subpart.lower().replace("doc","").strip() for subpart in subparts]
-             subparts = [f"<span class='doc-ref'><sup>{subpart}</sup></span>" for subpart in subparts]
+             subparts = [
+                 subpart.lower().replace("doc", "").strip() for subpart in subparts
+             ]
+             subparts = [
+                 f"<span class='doc-ref'><sup>{subpart}</sup></span>"
+                 for subpart in subparts
+             ]
              parts.append("".join(subparts))
          else:
              parts.append(part)
@@ -92,8 +80,7 @@ def parse_output_llm_with_sources(output):
      return content_parts
 
 
-
- job_done = object() # signals the processing is done
+ job_done = object()  # signals the processing is done
 
 
  class StreamingGradioCallbackHandler(BaseCallbackHandler):
@@ -125,45 +112,49 @@ class StreamingGradioCallbackHandler(BaseCallbackHandler):
          self.q.put(job_done)
 
 
-
-
  # Create embeddings function and LLM
- embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
+ embeddings_function = HuggingFaceEmbeddings(
+     model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"
+ )
 
 
  # Create vectorstore and retriever
  vectorstore = get_pinecone_vectorstore(embeddings_function)
 
- #---------------------------------------------------------------------------
+ # ---------------------------------------------------------------------------
  # ClimateQ&A Streaming
  # From https://github.com/gradio-app/gradio/issues/5345
  # And https://stackoverflow.com/questions/76057076/how-to-stream-agents-response-in-langchain
- #---------------------------------------------------------------------------
+ # ---------------------------------------------------------------------------
 
  from threading import Thread
 
- import json
 
- def answer_user(query,query_example,history):
+ def answer_user(query, query_example, history):
      if len(query) <= 2:
          raise Exception("Please ask a longer question")
      return query, history + [[query, ". . ."]]
 
- def answer_user_example(query,query_example,history):
+
+ def answer_user_example(query, query_example, history):
      return query_example, history + [[query_example, ". . ."]]
 
- def fetch_sources(query,sources):
 
+ def fetch_sources(query, sources):
      # Prepare default values
      if len(sources) == 0:
          sources = ["IPCC"]
 
-     llm_reformulation = get_llm(max_tokens = 512,temperature = 0.0,verbose = True,streaming = False)
-     retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,k_summary = 3,k_total = 10)
+     llm_reformulation = get_llm(
+         max_tokens=512, temperature=0.0, verbose=True, streaming=False
+     )
+     retriever = ClimateQARetriever(
+         vectorstore=vectorstore, sources=sources, k_summary=3, k_total=10
+     )
      reformulation_chain = load_reformulation_chain(llm_reformulation)
 
      # Calculate language
-     output_reformulation = reformulation_chain({"query":query})
+     output_reformulation = reformulation_chain({"query": query})
      question = output_reformulation["question"]
      language = output_reformulation["language"]
 
@@ -171,23 +162,23 @@ def fetch_sources(query,sources):
      docs = retriever.get_relevant_documents(question)
 
      if len(docs) > 0:
-
          # Already display the sources
          sources_text = []
          for i, d in enumerate(docs, 1):
              sources_text.append(make_html_source(d, i))
          citations_text = "".join(sources_text)
          docs_text = "\n\n".join([d.page_content for d in docs])
-         return "",citations_text,docs_text,question,language
+         return "", citations_text, docs_text, question, language
      else:
-         sources_text = "⚠️ No relevant passages found in the scientific reports (IPCC and IPBES)"
+         sources_text = (
+             "⚠️ No relevant passages found in the scientific reports (IPCC and IPBES)"
+         )
          citations_text = "**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate and biodiversity issues).**"
          docs_text = ""
-         return "",citations_text,docs_text,question,language
-
+         return "", citations_text, docs_text, question, language
 
- def answer_bot(query,history,docs,question,language,audience):
 
+ def answer_bot(query, history, docs, question, language, audience):
      if audience == "Children":
          audience_prompt = audience_prompts["children"]
      elif audience == "General public":
@@ -200,36 +191,52 @@ def answer_bot(query,history,docs,question,language,audience):
      # Prepare Queue for streaming LLMs
      Q = SimpleQueue()
 
-     llm_streaming = get_llm(max_tokens = 1024,temperature = 0.0,verbose = True,streaming = True,
-         callbacks=[StreamingGradioCallbackHandler(Q),StreamingStdOutCallbackHandler()],
+     llm_streaming = get_llm(
+         max_tokens=1024,
+         temperature=0.0,
+         verbose=True,
+         streaming=True,
+         callbacks=[StreamingGradioCallbackHandler(Q), StreamingStdOutCallbackHandler()],
      )
 
      qa_chain = load_qa_chain_with_text(llm_streaming)
 
-     def threaded_chain(question,audience,language,docs):
+     def threaded_chain(question, audience, language, docs):
          try:
-             response = qa_chain({"question":question,"audience":audience,"language":language,"summaries":docs})
+             response = qa_chain(
+                 {
+                     "question": question,
+                     "audience": audience,
+                     "language": language,
+                     "summaries": docs,
+                 }
+             )
              Q.put(response)
             Q.put(job_done)
          except Exception as e:
              print(e)
 
      history[-1][1] = ""
 
-     textbox=gr.Textbox(placeholder=". . .",show_label=False,scale=1,lines = 1,interactive = False)
+     textbox = gr.Textbox(
+         placeholder=". . .", show_label=False, scale=1, lines=1, interactive=False
+     )
 
      if len(docs) > 0:
-
          # Start thread for streaming
          thread = Thread(
-             target=threaded_chain,
-             kwargs={"question":question,"audience":audience_prompt,"language":language,"docs":docs}
+             target=threaded_chain,
+             kwargs={
+                 "question": question,
+                 "audience": audience_prompt,
+                 "language": language,
+                 "docs": docs,
+             },
          )
          thread.start()
 
          while True:
-             next_item = Q.get(block=True) # Blocks until an input is available
+             next_item = Q.get(block=True)  # Blocks until an input is available
 
              if next_item is job_done:
                  break
@@ -237,88 +244,27 @@ def answer_bot(query,history,docs,question,language,audience):
              new_paragraph = history[-1][1] + next_item
              new_paragraph = parse_output_llm_with_sources(new_paragraph)
              history[-1][1] = new_paragraph
-             yield textbox,history
+             yield textbox, history
          else:
              pass
          thread.join()
 
-     # Log answer on Azure Blob Storage
-     timestamp = str(datetime.now().timestamp())
-     file = timestamp + ".json"
-     prompt = history[-1][0]
-     logs = {
-         "user_id": str(user_id),
-         "prompt": prompt,
-         "query": prompt,
-         "question":question,
-         "docs":docs,
-         "answer": history[-1][1],
-         "time": timestamp,
-     }
-     log_on_azure(file, logs, share_client)
-
-
+     log(question=question, history=history, docs=docs, user_id=user_id)
 
      else:
          complete_response = "**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate and biodiversity issues).**"
          history[-1][1] += complete_response
-         yield "",history
-
-
-
-     # history_langchain_format = []
-     # for human, ai in history:
-     #     history_langchain_format.append(HumanMessage(content=human))
-     #     history_langchain_format.append(AIMessage(content=ai))
-     # history_langchain_format.append(HumanMessage(content=message)
-     # for next_token, content in stream(message):
-     #     yield(content)
-
-     # thread = Thread(target=threaded_chain, kwargs={"query":message,"audience":audience_prompt})
-     # thread.start()
-
-     # history[-1][1] = ""
-     # while True:
-     #     next_item = Q.get(block=True) # Blocks until an input is available
-
-     #     print(type(next_item))
-     #     if next_item is job_done:
-     #         continue
-
-     #     elif isinstance(next_item, dict): # assuming LLMResult is a dictionary
-     #         response = next_item
-     #         if "source_documents" in response and len(response["source_documents"]) > 0:
-     #             sources_text = []
-     #             for i, d in enumerate(response["source_documents"], 1):
-     #                 sources_text.append(make_html_source(d, i))
-     #             sources_text = "\n\n".join([f"Query used for retrieval:\n{response['question']}"] + sources_text)
-     #             # history[-1][1] += next_item["answer"]
-     #             # history[-1][1] += "\n\n" + sources_text
-     #             yield "", history, sources_text
-
-     #         else:
-     #             sources_text = "⚠️ No relevant passages found in the scientific reports (IPCC and IPBES)"
-     #             complete_response = "**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate and biodiversity issues).**"
-     #             history[-1][1] += "\n\n" + complete_response
-     #             yield "", history, sources_text
-     #             break
-
-     #     elif isinstance(next_item, str):
-     #         new_paragraph = history[-1][1] + next_item
-     #         new_paragraph = parse_output_llm_with_sources(new_paragraph)
-     #         history[-1][1] = new_paragraph
-     #         yield "", history, ""
-
-     #     thread.join()
-
- #---------------------------------------------------------------------------
+         yield "", history
+
+
+ # ---------------------------------------------------------------------------
  # ClimateQ&A core functions
- #---------------------------------------------------------------------------
+ # ---------------------------------------------------------------------------
 
 
- def make_html_source(source,i):
+ def make_html_source(source, i):
      meta = source.metadata
-     content = source.page_content.split(":",1)[1].strip()
+     content = source.page_content.split(":", 1)[1].strip()
      return f"""
      <div class="card">
          <div class="card-content">
@@ -335,144 +281,9 @@ def make_html_source(source,i):
      """
 
 
-
- # def chat(
- #     user_id: str,
- #     query: str,
- #     history: list = [system_template],
- #     report_type: str = "IPCC",
- #     threshold: float = 0.555,
- # ) -> tuple:
- #     """retrieve relevant documents in the document store then query gpt-turbo
-
- #     Args:
- #         query (str): user message.
- #         history (list, optional): history of the conversation. Defaults to [system_template].
- #         report_type (str, optional): should be "All available" or "IPCC only". Defaults to "All available".
- #         threshold (float, optional): similarity threshold, don't increase more than 0.568. Defaults to 0.56.
-
- #     Yields:
- #         tuple: chat gradio format, chat openai format, sources used.
- #     """
-
- #     if report_type not in ["IPCC","IPBES"]: report_type = "all"
- #     print("Searching in ",report_type," reports")
- #     # if report_type == "All available":
- #     #     retriever = retrieve_all
- #     # elif report_type == "IPCC only":
- #     #     retriever = retrieve_giec
- #     # else:
- #     #     raise Exception("report_type arg should be in (All available, IPCC only)")
-
- #     reformulated_query = openai.Completion.create(
- #         engine="EkiGPT",
- #         prompt=get_reformulation_prompt(query),
- #         temperature=0,
- #         max_tokens=128,
- #         stop=["\n---\n", "<|im_end|>"],
- #     )
- #     reformulated_query = reformulated_query["choices"][0]["text"]
- #     reformulated_query, language = reformulated_query.split("\n")
- #     language = language.split(":")[1].strip()
-
-
- #     sources = retrieve_with_summaries(reformulated_query,retriever,k_total = 10,k_summary = 3,as_dict = True,source = report_type.lower(),threshold = threshold)
- #     response_retriever = {
- #         "language":language,
- #         "reformulated_query":reformulated_query,
- #         "query":query,
- #         "sources":sources,
- #     }
-
- #     # docs = [d for d in retriever.retrieve(query=reformulated_query, top_k=10) if d.score > threshold]
- #     messages = history + [{"role": "user", "content": query}]
-
- #     if len(sources) > 0:
- #         docs_string = []
- #         docs_html = []
- #         for i, d in enumerate(sources, 1):
- #             docs_string.append(f"📃 Doc {i}: {d['meta']['short_name']} page {d['meta']['page_number']}\n{d['content']}")
- #             docs_html.append(make_html_source(d,i))
- #         docs_string = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_string)
- #         docs_html = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_html)
- #         messages.append({"role": "system", "content": f"{sources_prompt}\n\n{docs_string}\n\nAnswer in {language}:"})
-
-
- #         response = openai.Completion.create(
- #             engine="EkiGPT",
- #             prompt=to_completion(messages),
- #             temperature=0, # deterministic
- #             stream=True,
- #             max_tokens=1024,
- #         )
-
- #         complete_response = ""
- #         messages.pop()
-
- #         messages.append({"role": "assistant", "content": complete_response})
- #         timestamp = str(datetime.now().timestamp())
- #         file = user_id + timestamp + ".json"
- #         logs = {
- #             "user_id": user_id,
- #             "prompt": query,
- #             "retrived": sources,
- #             "report_type": report_type,
- #             "prompt_eng": messages[0],
- #             "answer": messages[-1]["content"],
- #             "time": timestamp,
- #         }
- #         log_on_azure(file, logs, share_client)
-
- #         for chunk in response:
- #             if (chunk_message := chunk["choices"][0].get("text")) and chunk_message != "<|im_end|>":
- #                 complete_response += chunk_message
- #                 messages[-1]["content"] = complete_response
- #                 gradio_format = make_pairs([a["content"] for a in messages[1:]])
- #                 yield gradio_format, messages, docs_html
-
- #     else:
- #         docs_string = "⚠️ No relevant passages found in the climate science reports (IPCC and IPBES)"
- #         complete_response = "**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate issues).**"
- #         messages.append({"role": "assistant", "content": complete_response})
- #         gradio_format = make_pairs([a["content"] for a in messages[1:]])
- #         yield gradio_format, messages, docs_string
-
-
- def save_feedback(feed: str, user_id):
-     if len(feed) > 1:
-         timestamp = str(datetime.now().timestamp())
-         file = user_id + timestamp + ".json"
-         logs = {
-             "user_id": user_id,
-             "feedback": feed,
-             "time": timestamp,
-         }
-         log_on_azure(file, logs, share_client)
-         return "Feedback submitted, thank you!"
-
-
  def reset_textbox():
      return gr.update(value="")
 
- import json
-
- def log_on_azure(file, logs, share_client):
-     logs = json.dumps(logs)
-     print(type(logs))
-     file_client = share_client.get_file_client(file)
-     print("Uploading logs to Azure Blob Storage")
-     print("----------------------------------")
-     print("")
-     print(logs)
-     file_client.upload_file(logs)
-     print("Logs uploaded to Azure Blob Storage")
-
-
- # def disable_component():
-     #     return gr.update(interactive = False)
-
-
-
 
  # --------------------------------------------------------------------
  # Gradio
@@ -509,29 +320,33 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
      # user_id_state = gr.State([user_id])
 
      with gr.Tab("🌍 ClimateQ&A"):
-
          with gr.Row(elem_id="chatbot-row"):
              with gr.Column(scale=2):
                  # state = gr.State([system_template])
                  bot = gr.Chatbot(
-                     value=[[None,init_prompt]],
-                     show_copy_button=True,show_label = False,elem_id="chatbot",layout = "panel",avatar_images = ("assets/logo4.png",None))
-
-                 # bot.like(vote,None,None)
-
+                     value=[[None, init_prompt]],
+                     show_copy_button=True,
+                     show_label=False,
+                     elem_id="chatbot",
+                     layout="panel",
+                     avatar_images=("assets/logo4.png", None),
+                 )
 
+                 # bot.like(vote,None,None)
 
-                 with gr.Row(elem_id = "input-message"):
-                     textbox=gr.Textbox(placeholder="Ask me anything here!",show_label=False,scale=1,lines = 1,interactive = True)
+                 with gr.Row(elem_id="input-message"):
+                     textbox = gr.Textbox(
+                         placeholder="Ask me anything here!",
+                         show_label=False,
+                         scale=1,
+                         lines=1,
+                         interactive=True,
+                     )
                      # submit_button = gr.Button(">",scale = 1,elem_id = "submit-button")
 
-
-             with gr.Column(scale=1, variant="panel",elem_id = "right-panel"):
-
-
+             with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
                  with gr.Tabs() as tabs:
-                     with gr.TabItem("📝 Examples",elem_id = "tab-examples",id = 0):
-
+                     with gr.TabItem("📝 Examples", elem_id="tab-examples", id=0):
                          examples_hidden = gr.Textbox(elem_id="hidden-message")
 
                          examples_questions = gr.Examples(
@@ -575,14 +390,16 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
                          # cache_examples=True,
                          )
 
-                     with gr.Tab("📚 Citations",elem_id = "tab-citations",id = 1):
-                         sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
+                     with gr.Tab("📚 Citations", elem_id="tab-citations", id=1):
+                         sources_textbox = gr.HTML(
+                             show_label=False, elem_id="sources-textbox"
+                         )
                          docs_textbox = gr.State("")
 
-                     with gr.Tab("⚙️ Configuration",elem_id = "tab-config",id = 2):
-
-                         gr.Markdown("Reminder: You can talk in any language, ClimateQ&A is multi-lingual!")
-
+                     with gr.Tab("⚙️ Configuration", elem_id="tab-config", id=2):
+                         gr.Markdown(
+                             "Reminder: You can talk in any language, ClimateQ&A is multi-lingual!"
+                         )
 
                          dropdown_sources = gr.CheckboxGroup(
                              ["IPCC", "IPBES"],
@@ -592,56 +409,106 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
                          )
 
                          dropdown_audience = gr.Dropdown(
-                             ["Children","General public","Experts"],
+                             ["Children", "General public", "Experts"],
                              label="Select audience",
                              value="Experts",
                              interactive=True,
                          )
 
-                         output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
-                         output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
-
-
+                         output_query = gr.Textbox(
+                             label="Query used for retrieval",
+                             show_label=True,
+                             elem_id="reformulated-query",
+                             lines=2,
+                             interactive=False,
+                         )
+                         output_language = gr.Textbox(
+                             label="Language",
+                             show_label=True,
+                             elem_id="language",
+                             lines=1,
+                             interactive=False,
+                         )
 
          # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
-         (textbox
-             .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
-             .success(change_tab,None,tabs)
-             .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
-             .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
-             .success(lambda x : textbox,[textbox],[textbox])
+         (
+             textbox.submit(
+                 answer_user,
+                 [textbox, examples_hidden, bot],
+                 [textbox, bot],
+                 queue=False,
+             )
+             .success(change_tab, None, tabs)
+             .success(
+                 fetch_sources,
+                 [textbox, dropdown_sources],
+                 [
+                     textbox,
+                     sources_textbox,
+                     docs_textbox,
+                     output_query,
+                     output_language,
+                 ],
+             )
+             .success(
+                 answer_bot,
+                 [
+                     textbox,
+                     bot,
+                     docs_textbox,
+                     output_query,
+                     output_language,
+                     dropdown_audience,
+                 ],
+                 [textbox, bot],
+                 queue=True,
+             )
+             .success(lambda x: textbox, [textbox], [textbox])
          )
 
-         (examples_hidden
-             .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
-             .success(change_tab,None,tabs)
-             .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
-             .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
-             .success(lambda x : textbox,[textbox],[textbox])
+         (
+             examples_hidden.change(
+                 answer_user_example,
+                 [textbox, examples_hidden, bot],
+                 [textbox, bot],
+                 queue=False,
+             )
+             .success(change_tab, None, tabs)
+             .success(
+                 fetch_sources,
+                 [textbox, dropdown_sources],
+                 [
+                     textbox,
+                     sources_textbox,
+                     docs_textbox,
+                     output_query,
+                     output_language,
+                 ],
+             )
+             .success(
+                 answer_bot,
+                 [
+                     textbox,
+                     bot,
+                     docs_textbox,
+                     output_query,
+                     output_language,
+                     dropdown_audience,
+                 ],
+                 [textbox, bot],
+                 queue=True,
+             )
+             .success(lambda x: textbox, [textbox], [textbox])
         )
          # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
          #     answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
          # )
 
-
-
-
-
-
-
-
-
-
-
-
-
- #---------------------------------------------------------------------------------------
- # OTHER TABS
- #---------------------------------------------------------------------------------------
-
-
- with gr.Tab("ℹ️ About ClimateQ&A",elem_classes = "max-height"):
+     # ---------------------------------------------------------------------------------------
+     # OTHER TABS
+     # ---------------------------------------------------------------------------------------
+
+     with gr.Tab("ℹ️ About ClimateQ&A", elem_classes="max-height"):
          with gr.Row():
              with gr.Column(scale=1):
                  gr.Markdown(
@@ -660,7 +527,9 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
 
              with gr.Column(scale=1):
                  gr.Markdown("![](https://i.postimg.cc/fLvsvMzM/Untitled-design-5.png)")
-                 gr.Markdown("*Source : IPCC AR6 - Synthesis Report of the IPCC 6th assessment report (AR6)*")
+                 gr.Markdown(
+                     "*Source : IPCC AR6 - Synthesis Report of the IPCC 6th assessment report (AR6)*"
+                 )
 
          gr.Markdown("## How to use ClimateQ&A")
          with gr.Row():
@@ -688,7 +557,6 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
          """
          )
 
-
      with gr.Tab("📧 Contact, feedback and feature requests"):
          gr.Markdown(
              """
@@ -702,37 +570,10 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
          *This tool has been developed by the R&D lab at **Ekimetrics** (Jean Lelong, Nina Achache, Gabriel Olympie, Nicolas Chesneau, Natalia De la Calzada, Théo Alves Da Costa)*
          """
          )
-         # with gr.Row():
-         #     with gr.Column(scale=1):
-         #         gr.Markdown("### Feedbacks")
-         #         feedback = gr.Textbox(label="Write your feedback here")
-         #         feedback_output = gr.Textbox(label="Submit status")
-         #         feedback_save = gr.Button(value="submit feedback")
-         #         feedback_save.click(
-         #             save_feedback,
-         #             inputs=[feedback, user_id_state],
-         #             outputs=feedback_output,
-         #         )
-         #         gr.Markdown(
-         #             "If you need us to ask another climate science report or ask any question, contact us at <b>theo.alvesdacosta@ekimetrics.com</b>"
-         #         )
-
-         #     with gr.Column(scale=1):
-         #         gr.Markdown("### OpenAI API")
-         #         gr.Markdown(
-         #             "To make climate science accessible to a wider audience, we have opened our own OpenAI API key with a monthly cap of $1000. If you already have an API key, please use it to help conserve bandwidth for others."
-         #         )
-         #         openai_api_key_textbox = gr.Textbox(
-         #             placeholder="Paste your OpenAI API key (sk-...) and hit Enter",
-         #             show_label=False,
-         #             lines=1,
-         #             type="password",
-         #         )
-         #         openai_api_key_textbox.change(set_openai_api_key, inputs=[openai_api_key_textbox])
-         #         openai_api_key_textbox.submit(set_openai_api_key, inputs=[openai_api_key_textbox])
-
-     with gr.Tab("📚 Sources",elem_classes = "max-height"):
-         gr.Markdown("""
+
+     with gr.Tab("📚 Sources", elem_classes="max-height"):
+         gr.Markdown(
+             """
  | Source | Report | URL | Number of pages | Release date |
  | --- | --- | --- | --- | --- |
  IPCC | Summary for Policymakers. In: Climate Change 2021: The Physical Science Basis. Contribution of the WGI to the AR6 of the IPCC. | https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_SPM.pdf | 32 | 2021
@@ -770,10 +611,12 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
  IPBES | Summary for Policymakers. Regional Assessment Report on Biodiversity and Ecosystem Services for Europe and Central Asia. | https://zenodo.org/record/3237468/files/ipbes_assessment_spm_eca_EN.pdf | 52 | 2018
  IPBES | Full Report. Assessment Report on Land Degradation and Restoration. | https://zenodo.org/record/3237393/files/ipbes_assessment_report_ldra_EN.pdf | 748 | 2018
  IPBES | Summary for Policymakers. Assessment Report on Land Degradation and Restoration. | https://zenodo.org/record/3237393/files/ipbes_assessment_report_ldra_EN.pdf | 48 | 2018
- """)
+ """
+         )
 
      with gr.Tab("🛢️ Carbon Footprint"):
-         gr.Markdown("""
+         gr.Markdown(
+             """
 
  Carbon emissions were measured during the development and inference process using CodeCarbon [https://github.com/mlco2/codecarbon](https://github.com/mlco2/codecarbon)
 
@@ -787,10 +630,11 @@ Carbon emissions were measured during the development and inference process using CodeCarbon
  Carbon Emissions are **relatively low but not negligible** compared to other usages: one question asked to ClimateQ&A is around 0.482gCO2e - equivalent to 2.2m by car (https://datagir.ademe.fr/apps/impact-co2/)
  Or around 2 to 4 times more than a typical Google search.
  """
-     )
-
+         )
+
      with gr.Tab("🪄 Changelog"):
-         gr.Markdown("""
+         gr.Markdown(
+             """
 
  ##### v1.1.0 - *2023-10-16*
  - ClimateQ&A on Hugging Face is finally working again with all the new features !
@@ -805,7 +649,7 @@ Or around 2 to 4 times more than a typical Google search.
  - Add children mode on https://climateqa.com
  - Add follow-up questions https://climateqa.com
  """
-     )
+         )
 
  demo.queue(concurrency_count=16)
 
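Note on the streaming path retained above: `answer_bot` hands tokens from the LLM callback to the UI through a `SimpleQueue`, with the `job_done` sentinel object marking the end of generation. Below is a minimal, self-contained sketch of that producer/consumer pattern; the `produce`/`consume` names and the hard-coded token list are illustrative stand-ins for the LLM callback, not part of this commit.

```python
from queue import SimpleQueue
from threading import Thread

job_done = object()  # sentinel: signals the producer is finished


def produce(q):
    # Stand-in for StreamingGradioCallbackHandler, which puts tokens
    # on the queue as the LLM generates them
    for token in ["Str", "eam", "ing", " works"]:
        q.put(token)
    q.put(job_done)


def consume():
    q = SimpleQueue()
    thread = Thread(target=produce, kwargs={"q": q})
    thread.start()
    answer = ""
    while True:
        next_item = q.get(block=True)  # blocks until an item is available
        if next_item is job_done:
            break
        if isinstance(next_item, str):
            answer += next_item
            yield answer  # each partial answer re-renders the chat in Gradio
    thread.join()


for partial in consume():
    print(partial)
```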
climateqa/logging.py ADDED
@@ -0,0 +1,70 @@
+ from datetime import datetime
+ import json
+ import os
+
+ from azure.storage.fileshare import ShareServiceClient
+
+
+ def log(question, history, docs, user_id):
+     if has_blob_config():
+         log_in_azure(question, history, docs, user_id)
+     pass
+
+
+ def has_blob_config():
+     """
+     Checks if the necessary environment variables for Azure Blob Storage are set.
+     Returns True if they are set, False otherwise.
+     """
+     return all(
+         key in os.environ
+         for key in ["BLOB_ACCOUNT_KEY", "BLOB_ACCOUNT_NAME", "BLOB_ACCOUNT_URL"]
+     )
+
+
+ def log_in_azure(question, history, docs, user_id):
+     timestamp = str(datetime.now().timestamp())
+     file_name = timestamp + ".json"
+     prompt = history[-1][0]
+     logs = {
+         "user_id": str(user_id),
+         "prompt": prompt,
+         "query": prompt,
+         "question": question,
+         "docs": docs,
+         "answer": history[-1][1],
+         "time": timestamp,
+     }
+     upload_azure(file_name, logs)
+
+
+ def get_azure_blob_client():
+     account_key = os.environ["BLOB_ACCOUNT_KEY"]
+     if len(account_key) == 86:
+         account_key += "=="
+
+     credential = {
+         "account_key": account_key,
+         "account_name": os.environ["BLOB_ACCOUNT_NAME"],
+     }
+     account_url = os.environ["BLOB_ACCOUNT_URL"]
+     file_share_name = "climategpt"
+     service = ShareServiceClient(account_url=account_url, credential=credential)
+     share_client = service.get_share_client(file_share_name)
+     return share_client
+
+
+ if has_blob_config():
+     share_client = get_azure_blob_client()
+
+
+ def upload_azure(file, logs):
+     logs = json.dumps(logs)
+     print(type(logs))
+     assert share_client is not None
+     file_client = share_client.get_file_client(file)
+     print("Uploading logs to Azure Blob Storage")
+     print("----------------------------------")
+     print("")
+     print(logs)
+     file_client.upload_file(logs)
+     print("Logs uploaded to Azure Blob Storage")
climateqa/vectorstore.py CHANGED
@@ -24,21 +24,3 @@ def get_pinecone_vectorstore(embeddings,text_key = "content"):
      index_name = os.getenv("PINECONE_API_INDEX")
      vectorstore = Pinecone.from_existing_index(index_name, embeddings,text_key = text_key)
      return vectorstore
-
-
- # def get_pinecone_retriever(vectorstore,k = 10,namespace = "vectors",sources = ["IPBES","IPCC"]):
-
- #     assert isinstance(sources,list)
-
- #     # Check if all elements in the list are either IPCC or IPBES
- #     filter = {
- #         "source": { "$in":sources},
- #     }
-
- #     retriever = vectorstore.as_retriever(search_kwargs={
- #         "k": k,
- #         "namespace":"vectors",
- #         "filter":filter
- #     })
-
- #     return retriever
 
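The deleted `get_pinecone_retriever` helper was already commented out, and its source-filtering idea lives on in `ClimateQARetriever`. For reference, a sketch of the same metadata filter expressed through the stock LangChain retriever API (assumes `PINECONE_API_INDEX` and the Pinecone credentials are configured; the query string and `k` are illustrative):

```python
from langchain.embeddings import HuggingFaceEmbeddings
from climateqa.vectorstore import get_pinecone_vectorstore

embeddings_function = HuggingFaceEmbeddings(
    model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"
)
vectorstore = get_pinecone_vectorstore(embeddings_function)

# Restrict retrieval to the selected reports, as the deleted helper sketched
retriever = vectorstore.as_retriever(
    search_kwargs={
        "k": 10,
        "namespace": "vectors",
        "filter": {"source": {"$in": ["IPCC", "IPBES"]}},
    }
)
docs = retriever.get_relevant_documents("How fast is sea level rising?")
```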