timeki committed on
Commit e41465b
1 Parent(s): 5228f5c

Squashed commit of the following:

commit 4ab651938b1c2af7c0d8f155488820e47b42c6c8
Author: timeki <timothee.bohe@ekimetrics.com>
Date: Mon Oct 14 14:49:09 2024 +0200

remove unused code

commit df5d08d8710beacb04a9c0c281a195c6dd7cc800
Author: timeki <timothee.bohe@ekimetrics.com>
Date: Mon Oct 14 14:46:45 2024 +0200

add steps and fix css for gradio 5

commit 7da1a3ac2237ded7b9891fdcda32d0674a9b7b4a
Author: timeki <timothee.bohe@ekimetrics.com>
Date: Tue Oct 8 17:09:48 2024 +0200

WIP

commit 6a39d69f772f97cef8b0b551a888ace822713753
Author: timeki <timothee.bohe@ekimetrics.com>
Date: Mon Oct 7 15:05:58 2024 +0200

WIP

commit bed4e9bbfb6f7c823789daf54c443f3f27198b45
Author: timeki <timothee.bohe@ekimetrics.com>
Date: Fri Oct 4 11:01:23 2024 +0200

add message types

commit 9dd246e7f975322a6be247188bffb7aa0f6d954e
Author: timeki <timothee.bohe@ekimetrics.com>
Date: Fri Oct 4 09:44:22 2024 +0200

Update .gitignore

commit 25e32e6bdf0ca289bef8617d92ad77d7edeac19f
Author: timeki <timothee.bohe@ekimetrics.com>
Date: Wed Sep 25 18:06:47 2024 +0200

Update requirements.txt

commit ae857ef845ac5b3baed5ef7de1e1b8b63874947e
Author: timeki <timothee.bohe@ekimetrics.com>
Date: Wed Sep 25 17:56:09 2024 +0200

Update requirements.txt

commit 72edd2d9e6ad64e3ecb59505b744cd415b9a6776
Author: timeki <timothee.bohe@ekimetrics.com>
Date: Wed Sep 25 16:58:27 2024 +0200

bugfixes

- dummy search and answer_search
- langchain_community packages
- typing class
- fix classes import

commit 99e91d83efb40b6cfec5a887f0d464eaffd09431
Author: Theo Alves Da Costa <theo.alves.da.costa@gmail.com>
Date: Mon Sep 9 08:32:17 2024 +0200

agents mode

commit fd67e156abd0293625d2b73765bda2d3905fa5de
Author: Theo Alves Da Costa <theo.alves.da.costa@gmail.com>
Date: Tue May 28 09:35:04 2024 +0200

Update app.py

commit 088e816846227b694f2d56ca3af739cc010de4bc
Author: Theo Alves Da Costa <theo.alves.da.costa@gmail.com>
Date: Tue May 28 09:13:33 2024 +0200

Connecting to front

commit 481f3b1453fde4c19018915d101d575b6ea25a3e
Author: Théo ALVES DA COSTA <theo.alvesdacosta@ekimetrics.com>
Date: Tue May 21 23:56:15 2024 +0200

First commit CQA with Agents

.gitignore CHANGED
@@ -6,4 +6,8 @@ __pycache__/utils.cpython-38.pyc
 notebooks/
 *.pyc
 
-sandbox
+**/.ipynb_checkpoints/
+**/.flashrank_cache/
+
+data/
+sandbox/
app.py CHANGED
@@ -1,10 +1,10 @@
 from climateqa.engine.embeddings import get_embeddings_function
 embeddings_function = get_embeddings_function()
 
-from climateqa.papers.openalex import OpenAlex
+from climateqa.knowledge.openalex import OpenAlex
 from sentence_transformers import CrossEncoder
 
-reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
+# reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
 oa = OpenAlex()
 
 import gradio as gr
@@ -15,6 +15,8 @@ import time
 import re
 import json
 
+from gradio import ChatMessage
+
 # from gradio_modal import Modal
 
 from io import BytesIO
@@ -29,16 +31,19 @@ from utils import create_user_id
 
 # ClimateQ&A imports
 from climateqa.engine.llm import get_llm
-from climateqa.engine.rag import make_rag_chain
 from climateqa.engine.vectorstore import get_pinecone_vectorstore
-from climateqa.engine.retriever import ClimateQARetriever
+from climateqa.knowledge.retriever import ClimateQARetriever
+from climateqa.engine.reranker import get_reranker
 from climateqa.engine.embeddings import get_embeddings_function
-from climateqa.engine.prompts import audience_prompts
+from climateqa.engine.chains.prompts import audience_prompts
 from climateqa.sample_questions import QUESTIONS
 from climateqa.constants import POSSIBLE_REPORTS
 from climateqa.utils import get_image_from_azure_blob_storage
 from climateqa.engine.keywords import make_keywords_chain
-from climateqa.engine.rag import make_rag_papers_chain
+# from climateqa.engine.chains.answer_rag import make_rag_papers_chain
+from climateqa.engine.graph import make_graph_agent,display_graph
+
+from front.utils import make_html_source,parse_output_llm_with_sources,serialize_docs,make_toolbox
 
 # Load environment variables in local mode
 try:
@@ -81,48 +86,21 @@ user_id = create_user_id()
 
 
 
-def parse_output_llm_with_sources(output):
-    # Split the content into a list of text and "[Doc X]" references
-    content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
-    parts = []
-    for part in content_parts:
-        if part.startswith("Doc"):
-            subparts = part.split(",")
-            subparts = [subpart.lower().replace("doc","").strip() for subpart in subparts]
-            subparts = [f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{subpart}</sup></span></a>""" for subpart in subparts]
-            parts.append("".join(subparts))
-        else:
-            parts.append(part)
-    content_parts = "".join(parts)
-    return content_parts
-
-
 # Create vectorstore and retriever
 vectorstore = get_pinecone_vectorstore(embeddings_function)
 llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
+reranker = get_reranker("large")
+agent = make_graph_agent(llm,vectorstore,reranker)
 
 
-def make_pairs(lst):
-    """from a list of even length, make tuple pairs"""
-    return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)]
-
-
-def serialize_docs(docs):
-    new_docs = []
-    for doc in docs:
-        new_doc = {}
-        new_doc["page_content"] = doc.page_content
-        new_doc["metadata"] = doc.metadata
-        new_docs.append(new_doc)
-    return new_docs
-
 
 
 async def chat(query,history,audience,sources,reports):
     """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
     (messages in gradio format, messages in langchain format, source documents)"""
 
-    print(f">> NEW QUESTION : {query}")
+    date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(f">> NEW QUESTION ({date_now}) : {query}")
 
     if audience == "Children":
        audience_prompt = audience_prompts["children"]
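
Note: parse_output_llm_with_sources, removed above and re-imported from front.utils, rewrites inline [Doc N] citations into HTML footnote anchors. A quick behavior sketch, based on the removed implementation:

    # Behavior sketch of parse_output_llm_with_sources:
    text = "Sea levels are rising [Doc 1, Doc 2]."
    parse_output_llm_with_sources(text)
    # -> 'Sea levels are rising <a href="#doc1" ...><sup>1</sup>...</a><a href="#doc2" ...><sup>2</sup>...</a>.'
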
@@ -137,77 +115,79 @@ async def chat(query,history,audience,sources,reports):
     if len(sources) == 0:
         sources = ["IPCC"]
 
-    if len(reports) == 0:
-        reports = []
-
-    retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,min_size = 200,reports = reports,k_summary = 3,k_total = 15,threshold=0.5)
-    rag_chain = make_rag_chain(retriever,llm)
+    # if len(reports) == 0: # TODO
+    reports = []
 
-    inputs = {"query": query,"audience": audience_prompt}
-    result = rag_chain.astream_log(inputs) #{"callbacks":[MyCustomAsyncHandler()]})
-    # result = rag_chain.stream(inputs)
-
-    path_reformulation = "/logs/reformulation/final_output"
-    path_keywords = "/logs/keywords/final_output"
-    path_retriever = "/logs/find_documents/final_output"
-    path_answer = "/logs/answer/streamed_output_str/-"
+    inputs = {"user_input": query,"audience": audience_prompt,"sources":sources}
+    result = agent.astream_events(inputs,version = "v1")
+
+    # path_reformulation = "/logs/reformulation/final_output"
+    # path_keywords = "/logs/keywords/final_output"
+    # path_retriever = "/logs/find_documents/final_output"
+    # path_answer = "/logs/answer/streamed_output_str/-"
 
+    docs = []
     docs_html = ""
     output_query = ""
     output_language = ""
     output_keywords = ""
     gallery = []
+    start_streaming = False
 
+    steps_display = {
+        "categorize_intent": ("🔄️ Analyzing user message", True),
+        "transform_query": ("🔄️ Thinking step by step to answer the question", True),
+        "retrieve_documents": ("🔄️ Searching in the knowledge base", False),
+    }
+
+    used_documents = []
+    answer_message_content = ""
     try:
-        async for op in result:
-
-            op = op.ops[0]
-
-            if op['path'] == path_reformulation: # reformulated question
-                try:
-                    output_language = op['value']["language"] # str
-                    output_query = op["value"]["question"]
-                except Exception as e:
-                    raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")
-
-            if op["path"] == path_keywords:
-                try:
-                    output_keywords = op['value']["keywords"] # str
-                    output_keywords = " AND ".join(output_keywords)
-                except Exception as e:
-                    pass
-
-            elif op['path'] == path_retriever: # documents
-                try:
-                    docs = op['value']['docs'] # List[Document]
-                    docs_html = []
-                    for i, d in enumerate(docs, 1):
-                        docs_html.append(make_html_source(d, i))
-                    docs_html = "".join(docs_html)
-                except TypeError:
-                    print("No documents found")
-                    print("op: ",op)
-                    continue
-
-            elif op['path'] == path_answer: # final answer
-                new_token = op['value'] # str
-                # time.sleep(0.01)
-                previous_answer = history[-1][1]
-                previous_answer = previous_answer if previous_answer is not None else ""
-                answer_yet = previous_answer + new_token
-                answer_yet = parse_output_llm_with_sources(answer_yet)
-                history[-1] = (query,answer_yet)
-
-            else:
-                continue
-
-            history = [tuple(x) for x in history]
-            yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
-
+        async for event in result:
+            if "langgraph_node" in event["metadata"]:
+                node = event["metadata"]["langgraph_node"]
+
+                if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents": # when documents are retrieved
+                    try:
+                        docs = event["data"]["output"]["documents"]
+                        docs_html = []
+                        for i, d in enumerate(docs, 1):
+                            docs_html.append(make_html_source(d, i))
+
+                        used_documents = used_documents + [d.metadata["name"] for d in docs]
+                        history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
+
+                        docs_html = "".join(docs_html)
+
+                    except Exception as e:
+                        print(f"Error getting documents: {e}")
+                        print(event)
+
+                elif event["name"] in steps_display.keys() and event["event"] == "on_chain_start": # display steps
+                    event_description,display_output = steps_display[node]
+                    if not hasattr(history[-1], 'metadata') or history[-1].metadata["title"] != event_description: # if a new step begins
+                        history.append(ChatMessage(role="assistant", content = "", metadata={'title': event_description}))
+
+                elif event["name"] != "transform_query" and event["event"] == "on_chat_model_stream" and node in ["answer_rag", "answer_search"]: # if streaming answer
+                    if start_streaming == False:
+                        start_streaming = True
+                        history.append(ChatMessage(role="assistant", content = ""))
+                    answer_message_content += event["data"]["chunk"].content
+                    answer_message_content = parse_output_llm_with_sources(answer_message_content)
+                    history[-1] = ChatMessage(role="assistant", content = answer_message_content)
+                    # history.append(ChatMessage(role="assistant", content = new_message_content))
+
+                if event["name"] == "transform_query" and event["event"] == "on_chain_end":
+                    if hasattr(history[-1],"content"):
+                        history[-1].content += "Decompose question into sub-questions: \n\n - " + "\n - ".join([q["question"] for q in event["data"]["output"]["remaining_questions"]])
+
+                if event["name"] == "categorize_intent" and event["event"] == "on_chain_start":
+                    print("X")
+
+                yield history,docs_html,output_query,output_language,gallery #,output_query,output_keywords
+
     except Exception as e:
+        print(event, "has failed")
        raise gr.Error(f"{e}")
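
Note: stripped of the UI bookkeeping, the loop above follows LangChain's astream_events v1 protocol. A minimal consumption sketch (agent and inputs as built earlier in this file):

    # Each event is a dict with "event" (e.g. "on_chain_start", "on_chain_end",
    # "on_chat_model_stream"), "name" (the runnable or graph-node name),
    # "metadata" (inside a LangGraph run it carries "langgraph_node") and "data".
    async for event in agent.astream_events(inputs, version="v1"):
        if event["event"] == "on_chat_model_stream":
            print(event["data"]["chunk"].content, end="")  # streamed answer tokens
        elif event["event"] == "on_chain_end" and event["name"] == "retrieve_documents":
            docs = event["data"]["output"]["documents"]    # retrieved Documents
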
@@ -268,83 +248,7 @@ async def chat(query,history,audience,sources,reports):
         history[-1] = (history[-1][0],answer_yet)
         history = [tuple(x) for x in history]
 
-        # gallery = [x.metadata["image_path"] for x in docs if (len(x.metadata["image_path"]) > 0 and "IAS" in x.metadata["image_path"])]
-        # if len(gallery) > 0:
-        #     gallery = list(set("|".join(gallery).split("|")))
-        #     gallery = [get_image_from_azure_blob_storage(x) for x in gallery]
-
-    yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
-
-
-def make_html_source(source,i):
-    meta = source.metadata
-    # content = source.page_content.split(":",1)[1].strip()
-    content = source.page_content.strip()
-
-    toc_levels = []
-    for j in range(2):
-        level = meta[f"toc_level{j}"]
-        if level != "N/A":
-            toc_levels.append(level)
-        else:
-            break
-    toc_levels = " > ".join(toc_levels)
-
-    if len(toc_levels) > 0:
-        name = f"<b>{toc_levels}</b><br/>{meta['name']}"
-    else:
-        name = meta['name']
-
-    if meta["chunk_type"] == "text":
-
-        card = f"""
-        <div class="card" id="doc{i}">
-            <div class="card-content">
-                <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
-                <p>{content}</p>
-            </div>
-            <div class="card-footer">
-                <span>{name}</span>
-                <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
-                    <span role="img" aria-label="Open PDF">🔗</span>
-                </a>
-            </div>
-        </div>
-        """
-
-    else:
-
-        if meta["figure_code"] != "N/A":
-            title = f"{meta['figure_code']} - {meta['short_name']}"
-        else:
-            title = f"{meta['short_name']}"
-
-        card = f"""
-        <div class="card card-image">
-            <div class="card-content">
-                <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
-                <p>{content}</p>
-                <p class='ai-generated'>AI-generated description</p>
-            </div>
-            <div class="card-footer">
-                <span>{name}</span>
-                <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
-                    <span role="img" aria-label="Open PDF">🔗</span>
-                </a>
-            </div>
-        </div>
-        """
-
-    return card
-
-
-    # else:
-    #     docs_string = "No relevant passages found in the climate science reports (IPCC and IPBES)"
-    #     complete_response = "**No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate issues).**"
-    #     messages.append({"role": "assistant", "content": complete_response})
-    #     gradio_format = make_pairs([a["content"] for a in messages[1:]])
-    #     yield gradio_format, messages, docs_string
+        yield history,docs_html,output_query,output_language,gallery #,output_query,output_keywords
 
 
 def save_feedback(feed: str, user_id):
@@ -390,56 +294,6 @@ papers_cols_widths = {
 papers_cols = list(papers_cols_widths.keys())
 papers_cols_widths = list(papers_cols_widths.values())
 
-async def find_papers(query, keywords,after):
-
-    summary = ""
-
-    df_works = oa.search(keywords,after = after)
-    df_works = df_works.dropna(subset=["abstract"])
-    df_works = oa.rerank(query,df_works,reranker)
-    df_works = df_works.sort_values("rerank_score",ascending=False)
-    G = oa.make_network(df_works)
-
-    height = "750px"
-    network = oa.show_network(G,color_by = "rerank_score",notebook=False,height = height)
-    network_html = network.generate_html()
-
-    network_html = network_html.replace("'", "\"")
-    css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
-    network_html = network_html + css_to_inject
-
-    network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
-    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
-    allow-scripts allow-same-origin allow-popups
-    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
-    allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
-
-    docs = df_works["content"].head(15).tolist()
-
-    df_works = df_works.reset_index(drop = True).reset_index().rename(columns = {"index":"doc"})
-    df_works["doc"] = df_works["doc"] + 1
-    df_works = df_works[papers_cols]
-
-    yield df_works,network_html,summary
-
-    chain = make_rag_papers_chain(llm)
-    result = chain.astream_log({"question": query,"docs": docs,"language":"English"})
-    path_answer = "/logs/StrOutputParser/streamed_output/-"
-
-    async for op in result:
-
-        op = op.ops[0]
-
-        if op['path'] == path_answer: # streamed answer token
-            new_token = op['value'] # str
-            summary += new_token
-        else:
-            continue
-        yield df_works,network_html,summary
-
 
 # --------------------------------------------------------------------
 # Gradio
@@ -469,19 +323,21 @@ def vote(data: gr.LikeData):
 
 
 
-with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main-component") as demo:
-    # user_id_state = gr.State([user_id])
+with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme,elem_id = "main-component") as demo:
 
     with gr.Tab("ClimateQ&A"):
 
         with gr.Row(elem_id="chatbot-row"):
             with gr.Column(scale=2):
-                # state = gr.State([system_template])
                 chatbot = gr.Chatbot(
-                    value=[(None,init_prompt)],
-                    show_copy_button=True,show_label = False,elem_id="chatbot",layout = "panel",
+                    value = [ChatMessage(role="assistant", content=init_prompt)],
+                    type = "messages",
+                    show_copy_button=True,
+                    show_label = False,
+                    elem_id="chatbot",
+                    layout = "panel",
                     avatar_images = (None,"https://i.ibb.co/YNyd5W2/logo4.png"),
-                )#,avatar_images = ("assets/logo4.png",None))
+                )
 
                # bot.like(vote,None,None)
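
Note: with type="messages", history entries are gradio.ChatMessage objects, and a message whose metadata carries a "title" is rendered by Gradio as a collapsible step block rather than a plain bubble, which is what the steps_display logic in chat() relies on. A minimal sketch:

    from gradio import ChatMessage

    history = [
        # Rendered as a collapsible "step" block because metadata has a title:
        ChatMessage(role="assistant", content="", metadata={"title": "🔄️ Searching in the knowledge base"}),
        # Rendered as a regular assistant bubble:
        ChatMessage(role="assistant", content="Here is the answer..."),
    ]
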
@@ -489,8 +345,7 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
 
                 with gr.Row(elem_id = "input-message"):
                     textbox=gr.Textbox(placeholder="Ask me anything here!",show_label=False,scale=7,lines = 1,interactive = True,elem_id="input-textbox")
-                    # submit = gr.Button("",elem_id = "submit-button",scale = 1,interactive = True,icon = "https://static-00.iconduck.com/assets.00/settings-icon-2048x2046-cw28eevx.png")
-
+
 
             with gr.Column(scale=1, variant="panel",elem_id = "right-panel"):
@@ -560,9 +415,6 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
 
 
 
-
-
-
     #---------------------------------------------------------------------------------------
     # OTHER TABS
     #---------------------------------------------------------------------------------------
@@ -571,25 +423,25 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
    with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
        gallery_component = gr.Gallery()
 
-    with gr.Tab("Papers (beta)",elem_id = "tab-papers",elem_classes = "max-height other-tabs"):
+    # with gr.Tab("Papers (beta)",elem_id = "tab-papers",elem_classes = "max-height other-tabs"):
 
-        with gr.Row():
-            with gr.Column(scale=1):
-                query_papers = gr.Textbox(placeholder="Question",show_label=False,lines = 1,interactive = True,elem_id="query-papers")
-                keywords_papers = gr.Textbox(placeholder="Keywords",show_label=False,lines = 1,interactive = True,elem_id="keywords-papers")
-                after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers")
-                search_papers = gr.Button("Search",elem_id="search-papers",interactive=True)
+    #     with gr.Row():
+    #         with gr.Column(scale=1):
+    #             query_papers = gr.Textbox(placeholder="Question",show_label=False,lines = 1,interactive = True,elem_id="query-papers")
+    #             keywords_papers = gr.Textbox(placeholder="Keywords",show_label=False,lines = 1,interactive = True,elem_id="keywords-papers")
+    #             after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers")
+    #             search_papers = gr.Button("Search",elem_id="search-papers",interactive=True)
 
-            with gr.Column(scale=7):
+    #     with gr.Column(scale=7):
 
-                with gr.Tab("Summary",elem_id="papers-summary-tab"):
-                    papers_summary = gr.Markdown(visible=True,elem_id="papers-summary")
+    #         with gr.Tab("Summary",elem_id="papers-summary-tab"):
+    #             papers_summary = gr.Markdown(visible=True,elem_id="papers-summary")
 
-                with gr.Tab("Relevant papers",elem_id="papers-results-tab"):
-                    papers_dataframe = gr.Dataframe(visible=True,elem_id="papers-table",headers = papers_cols)
+    #         with gr.Tab("Relevant papers",elem_id="papers-results-tab"):
+    #             papers_dataframe = gr.Dataframe(visible=True,elem_id="papers-table",headers = papers_cols)
 
-                with gr.Tab("Citations network",elem_id="papers-network-tab"):
-                    citations_network = gr.HTML(visible=True,elem_id="papers-citations-network")
+    #         with gr.Tab("Citations network",elem_id="papers-network-tab"):
+    #             citations_network = gr.HTML(visible=True,elem_id="papers-citations-network")
@@ -600,8 +452,9 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
 
 
     def start_chat(query,history):
-        history = history + [(query,None)]
-        history = [tuple(x) for x in history]
+        # history = history + [(query,None)]
+        # history = [tuple(x) for x in history]
+        history = history + [ChatMessage(role="user", content=query)]
         return (gr.update(interactive = False),gr.update(selected=1),history)
 
    def finish_chat():
607
  def finish_chat():
@@ -609,13 +462,13 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
609
 
610
  (textbox
611
  .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
612
- .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_textbox")
613
  .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
614
  )
615
 
616
  (examples_hidden
617
  .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
618
- .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_examples")
619
  .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
620
  )
621
 
@@ -630,47 +483,6 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
 
     dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
 
-    query_papers.submit(generate_keywords,[query_papers], [keywords_papers])
-    search_papers.click(find_papers,[query_papers,keywords_papers,after], [papers_dataframe,citations_network,papers_summary])
-
-    # # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
-    # (textbox
-    #     .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
-    #     .success(change_tab,None,tabs)
-    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
-    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
-    #     .success(lambda x : textbox,[textbox],[textbox])
-    # )
-
-    # (examples_hidden
-    #     .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
-    #     .success(change_tab,None,tabs)
-    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
-    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
-    #     .success(lambda x : textbox,[textbox],[textbox])
-    # )
-    # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
-    #     answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
-    # )
-
-
-    # with Modal(visible=True) as first_modal:
-    #     gr.Markdown("# Welcome to ClimateQ&A !")
-
-    #     gr.Markdown("### Examples")
-
-    #     examples = gr.Examples(
-    #         ["Yo ça roule","ça boume"],
-    #         [examples_hidden],
-    #         examples_per_page=8,
-    #         run_on_click=False,
-    #         elem_id="examples",
-    #         api_name="examples",
-    #     )
-
-
-    # submit.click(lambda: Modal(visible=True), None, config_modal)
-
 
 demo.queue()
climateqa/engine/chains/__init__.py ADDED
(empty file)
climateqa/engine/chains/answer_ai_impact.py ADDED
@@ -0,0 +1,46 @@
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+
+prompt_template = """
+You are ClimateQ&A, a helpful AI Assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
+Always stay true to climate and nature science and do not make up information.
+If you do not know the answer, just say you do not know.
+
+## Guidelines
+- Explain that the environmental impact of AI is not covered by the IPCC or IPBES reports, but you can recommend info based on the sources below
+- Answer the question in the original language of the question
+
+## Sources
+- You can propose to visit this page https://climateqa.com/docs/carbon-footprint/ to learn more about ClimateQ&A's own carbon footprint
+- You can recommend looking at the work of the AI & climate expert scientist Sasha Luccioni, in particular these papers
+    - Power Hungry Processing: Watts Driving the Cost of AI Deployment? - https://arxiv.org/abs/2311.16863 - about the carbon footprint at the inference stage of AI models
+    - Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning - https://arxiv.org/abs/2302.08476
+    - Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model - https://arxiv.org/abs/2211.02001 - about the carbon footprint of training a large language model
+- You can also recommend the following tools to calculate the carbon footprint of AI models
+    - CodeCarbon - https://github.com/mlco2/codecarbon to measure the carbon footprint of your code
+    - Ecologits - https://ecologits.ai/ to measure the carbon footprint of using LLM APIs
+"""
+
+
+def make_ai_impact_chain(llm):
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", prompt_template),
+        ("user", "{question}")
+    ])
+
+    chain = prompt | llm | StrOutputParser()
+    chain = chain.with_config({"run_name":"ai_impact_chain"})
+
+    return chain
+
+def make_ai_impact_node(llm):
+
+    ai_impact_chain = make_ai_impact_chain(llm)
+
+    async def answer_ai_impact(state,config):
+        answer = await ai_impact_chain.ainvoke({"question":state["user_input"]},config)
+        return {"answer":answer}
+
+    return answer_ai_impact
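
Note: a possible wiring sketch for this node, using the get_llm helper configured in app.py and the state shape used elsewhere in this commit:

    import asyncio
    from climateqa.engine.llm import get_llm
    from climateqa.engine.chains.answer_ai_impact import make_ai_impact_node

    llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
    answer_ai_impact = make_ai_impact_node(llm)

    async def demo():
        state = {"user_input": "What is the carbon footprint of AI?"}
        result = await answer_ai_impact(state, config={})  # returns {"answer": "..."}
        print(result["answer"])

    # asyncio.run(demo())
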
climateqa/engine/chains/answer_chitchat.py ADDED
@@ -0,0 +1,52 @@
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+
+chitchat_prompt_template = """
+You are ClimateQ&A, a helpful AI Assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
+Always stay true to climate and nature science and do not make up information.
+If you do not know the answer, just say you do not know.
+
+## Guidelines
+- If it's a conversational question, you can normally chat with the user
+- If the question is not related to any topic about the environment, refuse to answer and politely ask the user to ask another question about the environment
+- If the user asks if you speak any language, you can say you speak all languages :)
+- If the user asks about the bot itself "ClimateQ&A", you can explain that you are an AI assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports and propose to visit the website here https://climateqa.com/docs/intro/ for more information
+- If the question is about ESG regulations, standards, or frameworks like the CSRD, TCFD, SASB, GRI, CDP, etc., you can explain that this is not a topic covered by the IPCC or IPBES reports.
+- Make clear that you are specialized in finding trustworthy information from the scientific reports of the IPCC and IPBES and other scientific literature
+- If relevant you can propose up to 3 examples of questions they could ask from the IPCC or IPBES reports from the examples below
+- Always answer in the original language of the question
+
+## Examples of questions you can suggest (in the original language of the question)
+"What evidence do we have of climate change?",
+"Are human activities causing global warming?",
+"What are the impacts of climate change?",
+"Can climate change be reversed?",
+"What is the difference between climate change and global warming?",
+"""
+
+
+def make_chitchat_chain(llm):
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", chitchat_prompt_template),
+        ("user", "{question}")
+    ])
+
+    chain = prompt | llm | StrOutputParser()
+    chain = chain.with_config({"run_name":"chitchat_chain"})
+
+    return chain
+
+
+
+def make_chitchat_node(llm):
+
+    chitchat_chain = make_chitchat_chain(llm)
+
+    async def answer_chitchat(state,config):
+        answer = await chitchat_chain.ainvoke({"question":state["user_input"]},config)
+        return {"answer":answer}
+
+    return answer_chitchat
climateqa/engine/chains/answer_rag.py ADDED
@@ -0,0 +1,99 @@
+from operator import itemgetter
+
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts.prompt import PromptTemplate
+from langchain_core.prompts.base import format_document
+
+from climateqa.engine.chains.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
+from climateqa.engine.chains.prompts import papers_prompt_template
+
+DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
+
+def _combine_documents(
+    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
+):
+
+    doc_strings = []
+
+    for i,doc in enumerate(docs):
+        # chunk_type = "Doc" if doc.metadata["chunk_type"] == "text" else "Image"
+        chunk_type = "Doc"
+        if isinstance(doc,str):
+            doc_formatted = doc
+        else:
+            doc_formatted = format_document(doc, document_prompt)
+        doc_string = f"{chunk_type} {i+1}: " + doc_formatted
+        doc_string = doc_string.replace("\n"," ")
+        doc_strings.append(doc_string)
+
+    return sep.join(doc_strings)
+
+
+def get_text_docs(x):
+    return [doc for doc in x if doc.metadata["chunk_type"] == "text"]
+
+def get_image_docs(x):
+    return [doc for doc in x if doc.metadata["chunk_type"] == "image"]
+
+def make_rag_chain(llm):
+    prompt = ChatPromptTemplate.from_template(answer_prompt_template)
+    chain = ({
+        "context":lambda x : _combine_documents(x["documents"]),
+        "query":itemgetter("query"),
+        "language":itemgetter("language"),
+        "audience":itemgetter("audience"),
+    } | prompt | llm | StrOutputParser())
+    return chain
+
+def make_rag_chain_without_docs(llm):
+    prompt = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
+    chain = prompt | llm | StrOutputParser()
+    return chain
+
+
+def make_rag_node(llm,with_docs = True):
+
+    if with_docs:
+        rag_chain = make_rag_chain(llm)
+    else:
+        rag_chain = make_rag_chain_without_docs(llm)
+
+    async def answer_rag(state,config):
+        answer = await rag_chain.ainvoke(state,config)
+        return {"answer":answer}
+
+    return answer_rag
+
+
+
+
+# def make_rag_papers_chain(llm):
+
+#     prompt = ChatPromptTemplate.from_template(papers_prompt_template)
+#     input_documents = {
+#         "context":lambda x : _combine_documents(x["docs"]),
+#         **pass_values(["question","language"])
+#     }
+
+#     chain = input_documents | prompt | llm | StrOutputParser()
+#     chain = rename_chain(chain,"answer")
+
+#     return chain
+
+
+
+
+
+
+# def make_illustration_chain(llm):
+
+#     prompt_with_images = ChatPromptTemplate.from_template(answer_prompt_images_template)
+
+#     input_description_images = {
+#         "images":lambda x : _combine_documents(get_image_docs(x["docs"])),
+#         **pass_values(["question","audience","language","answer"]),
+#     }
+
+#     illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
+#     return illustration_chain
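
Note: to make the context formatting concrete, here is what _combine_documents produces for two hypothetical documents:

    from langchain_core.documents import Document

    docs = [Document(page_content="Sea level rise\nis accelerating."),
            Document(page_content="Warming is human-driven.")]
    print(_combine_documents(docs))
    # Doc 1: Sea level rise is accelerating.
    #
    # Doc 2: Warming is human-driven.
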
climateqa/engine/chains/intent_categorization.py ADDED
@@ -0,0 +1,86 @@
+
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+
+class IntentCategorizer(BaseModel):
+    """Analyzing the user message input"""
+
+    language: str = Field(
+        description="Find the language of the message input in full words (ex: French, English, Spanish, ...), defaults to English",
+        default="English",
+    )
+    intent: str = Field(
+        enum=[
+            "ai_impact",
+            "geo_info",
+            "esg",
+            "search",
+            "chitchat",
+        ],
+        description="""
+            Categorize the user input into one of the following categories
+
+            Examples:
+            - ai_impact = Environmental impacts of AI: "What are the environmental impacts of AI", "How does AI affect the environment"
+            - geo_info = Geolocated info about climate change: Any question where the user wants to know localized impacts of climate change, eg: "What will be the temperature in Marseille in 2050"
+            - esg = Any question about the ESG regulation, frameworks and standards like the CSRD, TCFD, SASB, GRI, CDP, etc.
+            - search = Searching for any question about climate change, energy, biodiversity, nature, and everything we can find in the IPCC or IPBES reports or scientific papers
+            - chitchat = Any general question that is not related to the environment or climate change or just conversational, or if you don't think searching the IPCC or IPBES reports would be relevant
+        """,
+    )
+
+
+
+def make_intent_categorization_chain(llm):
+
+    openai_functions = [convert_to_openai_function(IntentCategorizer)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"IntentCategorizer"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_intent_categorization_node(llm):
+
+    categorization_chain = make_intent_categorization_chain(llm)
+
+    def categorize_message(state):
+        output = categorization_chain.invoke({"input":state["user_input"]})
+        if "language" not in output: output["language"] = "English"
+        output["query"] = state["user_input"]
+        return output
+
+    return categorize_message
+
+
+
+
+# SAMPLE_QUESTIONS = [
+#     "Est-ce que l'IA a un impact sur l'environnement ?",
+#     "Que dit le GIEC sur l'impact de l'IA",
+#     "Qui sont les membres du GIEC",
+#     "What is the impact of El Nino ?",
+#     "Yo",
+#     "Hello ça va bien ?",
+#     "Par qui as tu été créé ?",
+#     "What role do cloud formations play in modulating the Earth's radiative balance, and how are they represented in current climate models?",
+#     "Which industries have the highest GHG emissions?",
+#     "What are invasive alien species and how do they threaten biodiversity and ecosystems?",
+#     "Are human activities causing global warming?",
+#     "What is the motivation behind mining the deep seabed?",
+#     "Tu peux m'écrire un poème sur le changement climatique ?",
+#     "Tu peux m'écrire un poème sur les bonbons ?",
+#     "What will be the temperature in 2100 in Strasbourg?",
+#     "C'est quoi le lien entre biodiversity and changement climatique ?",
+# ]
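
Note: an invocation sketch (assumes an OpenAI function-calling model behind llm; the parsed function-call arguments come back as a plain dict):

    chain = make_intent_categorization_chain(llm)
    out = chain.invoke({"input": "Quel est l'impact du changement climatique à Marseille ?"})
    # Expected shape: {"language": "French", "intent": "geo_info"}
    # make_intent_categorization_node() then adds out["query"] = state["user_input"].
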
climateqa/engine/chains/keywords_extraction.py ADDED
@@ -0,0 +1,40 @@
+
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+
+class KeywordExtraction(BaseModel):
+    """
+    Analyzing the user query to extract keywords to feed a search engine
+    """
+
+    keywords: List[str] = Field(
+        description="""
+            Extract the keywords from the user query to feed a search engine as a list
+            Avoid adding super specific keywords to prefer general keywords
+            Maximum 3 keywords
+
+            Examples:
+            - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
+            - "How will El Nino be impacted by climate change" -> ["el nino","climate change"]
+            - "Is climate change a hoax" -> ["climate change","hoax"]
+        """
+    )
+
+
+def make_keywords_extraction_chain(llm):
+
+    openai_functions = [convert_to_openai_function(KeywordExtraction)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"KeywordExtraction"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
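
Note: an invocation sketch (function-calling model assumed), showing the output shape the retriever node relies on:

    chain = make_keywords_extraction_chain(llm)
    out = chain.invoke({"input": "How will El Nino be impacted by climate change?"})
    # Expected shape: {"keywords": ["el nino", "climate change"]}
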
climateqa/engine/{prompts.py → chains/prompts.py} RENAMED
@@ -56,7 +56,7 @@ Passages:
 {context}
 
 -----------------------
-Question: {question} - Explained to {audience}
+Question: {query} - Explained to {audience}
 Answer in {language} with the passages citations:
 """
 
@@ -137,7 +137,7 @@ Guidelines:
 - If the question is not related to environmental issues, never never answer it. Say it's not your role.
 - Make paragraphs by starting new lines to make your answers more readable.
 
-Question: {question}
+Question: {query}
 Answer in {language}:
 """
climateqa/engine/chains/query_transformation.py ADDED
@@ -0,0 +1,193 @@
+
+
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+
+ROUTING_INDEX = {
+    "Vector":["IPCC","IPBES","IPOS"],
+    "OpenAlex":["OpenAlex"],
+}
+
+POSSIBLE_SOURCES = [y for values in ROUTING_INDEX.values() for y in values]
+
+# Prompt from the original paper https://arxiv.org/pdf/2305.14283
+# Query Rewriting for Retrieval-Augmented Large Language Models
+class QueryDecomposition(BaseModel):
+    """
+    Decompose the user query into smaller parts to think step by step to answer this question
+    Act as a simple planning agent
+    """
+
+    questions: List[str] = Field(
+        description="""
+            Think step by step to answer this question, and provide one or several search engine questions in English for knowledge that you need.
+            Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find in the IPCC reports and scientific literature
+            - If it's already a standalone and explicit question, just return the reformulated question for the search engine
+            - If you need to decompose the question, output a list of maximum 2 to 3 questions
+        """
+    )
+
+
+class Location(BaseModel):
+    country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, addresses), ex: France, USA, ...")
+    location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
+
+class QueryAnalysis(BaseModel):
+    """
+    Analyzing the user query to extract topics, sources and date
+    Also do query expansion to get alternative search queries
+    Also provide simple keywords to feed a search engine
+    """
+
+    # keywords: List[str] = Field(
+    #     description="""
+    #         Extract the keywords from the user query to feed a search engine as a list
+    #         Maximum 3 keywords
+
+    #         Examples:
+    #         - "What is the impact of deep sea mining ?" -> deep sea mining
+    #         - "How will El Nino be impacted by climate change" -> el nino;climate change
+    #         - "Is climate change a hoax" -> climate change;hoax
+    #     """
+    # )
+
+    # alternative_queries: List[str] = Field(
+    #     description="""
+    #         Generate alternative search questions from the user query to feed a search engine
+    #     """
+    # )
+
+    # step_back_question: str = Field(
+    #     description="""
+    #         You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.
+    #         These questions should help you get more context and information about the user query
+    #     """
+    # )
+
+    sources: List[Literal["IPCC", "IPBES", "IPOS","OpenAlex"]] = Field(
+        ...,
+        description="""
+            Given a user question choose which documents would be most relevant for answering their question,
+            - IPCC is for questions about climate change, energy, impacts, and everything we can find in the IPCC reports
+            - IPBES is for questions about biodiversity and nature
+            - IPOS is for questions about the ocean and deep sea mining
+            - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific literature
+        """,
+    )
+    # topics: List[Literal[
+    #     "Climate change",
+    #     "Biodiversity",
+    #     "Energy",
+    #     "Decarbonization",
+    #     "Climate science",
+    #     "Nature",
+    #     "Climate policy and justice",
+    #     "Oceans",
+    #     "Deep sea mining",
+    #     "ESG and regulations",
+    #     "CSRD",
+    # ]] = Field(
+    #     ...,
+    #     description = """
+    #         Choose the topics that are most relevant to the user query, ex: Climate change, Energy, Biodiversity, ...
+    #     """,
+    # )
+    # date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
+    # location:Location
+
+
+def make_query_decomposition_chain(llm):
+
+    openai_functions = [convert_to_openai_function(QueryDecomposition)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryDecomposition"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_query_rewriter_chain(llm):
+
+    openai_functions = [convert_to_openai_function(QueryAnalysis)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_query_transform_node(llm,k_final=15):
+
+    decomposition_chain = make_query_decomposition_chain(llm)
+    rewriter_chain = make_query_rewriter_chain(llm)
+
+    def transform_query(state):
+
+        if "sources_auto" not in state or state["sources_auto"] is None or state["sources_auto"] is False:
+            auto_mode = False
+        else:
+            auto_mode = True
+
+        sources_input = state.get("sources_input")
+        if sources_input is None: sources_input = ROUTING_INDEX["Vector"]
+
+        new_state = {}
+
+        # Decomposition
+        decomposition_output = decomposition_chain.invoke({"input":state["query"]})
+        new_state.update(decomposition_output)
+
+        # Query Analysis
+        questions = []
+        for question in new_state["questions"]:
+            question_state = {"question":question}
+            analysis_output = rewriter_chain.invoke({"input":question})
+            question_state.update(analysis_output)
+            questions.append(question_state)
+
+        # Explode the questions into multiple questions with different sources
+        new_questions = []
+        for q in questions:
+            question,sources = q["question"],q["sources"]
+
+            # If not auto mode we take the configuration
+            if not auto_mode:
+                sources = sources_input
+
+            for index,index_sources in ROUTING_INDEX.items():
+                selected_sources = list(set(sources).intersection(index_sources))
+                if len(selected_sources) > 0:
+                    new_questions.append({"question":question,"sources":selected_sources,"index":index})
+
+        # # Add the number of questions to search
+        # k_by_question = k_final // len(new_questions)
+        # for q in new_questions:
+        #     q["k"] = k_by_question
+
+        # new_state["questions"] = new_questions
+        # new_state["remaining_questions"] = new_questions
+
+        new_state = {
+            "remaining_questions":new_questions,
+            "n_questions":len(new_questions),
+        }
+
+        return new_state
+
+    return transform_query
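
Note: an output sketch for the resulting node (hypothetical question; sources fixed by the user, so auto mode is off):

    node = make_query_transform_node(llm)
    state = {"query": "Is climate change real, and what does it do to the ocean?",
             "sources_input": ["IPCC", "OpenAlex"], "sources_auto": False}
    node(state)
    # Expected shape:
    # {"remaining_questions": [
    #     {"question": "...", "sources": ["IPCC"], "index": "Vector"},
    #     {"question": "...", "sources": ["OpenAlex"], "index": "OpenAlex"},
    #     # ...one entry per (sub-question, routing index) pair...
    #  ],
    #  "n_questions": 4}
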
climateqa/engine/{reformulation.py → chains/reformulation.py} RENAMED
@@ -3,7 +3,7 @@ from langchain.output_parsers.structured import StructuredOutputParser, ResponseSchema
 from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
 
-from climateqa.engine.prompts import reformulation_prompt_template
+from climateqa.engine.chains.prompts import reformulation_prompt_template
 from climateqa.engine.utils import pass_values, flatten_dict
 
 
climateqa/engine/chains/retrieve_documents.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+import sys
+import os
+from contextlib import contextmanager
+
+from langchain_core.tools import tool
+from langchain_core.runnables import chain
+from langchain_core.runnables import RunnableParallel, RunnablePassthrough
+from langchain_core.runnables import RunnableLambda
+
+from ..reranker import rerank_docs
+from ...knowledge.retriever import ClimateQARetriever
+from ...knowledge.openalex import OpenAlexRetriever
+from .keywords_extraction import make_keywords_extraction_chain
+from ..utils import log_event
+
+
+def divide_into_parts(target, parts):
+    # Base value for each part
+    base = target // parts
+    # Remainder to distribute
+    remainder = target % parts
+    # List to hold the result
+    result = []
+
+    for i in range(parts):
+        if i < remainder:
+            # These parts get base value + 1
+            result.append(base + 1)
+        else:
+            # The rest get the base value
+            result.append(base)
+
+    return result
+
+
+@contextmanager
+def suppress_output():
+    # Open a null device
+    with open(os.devnull, 'w') as devnull:
+        # Store the original stdout and stderr
+        old_stdout = sys.stdout
+        old_stderr = sys.stderr
+        # Redirect stdout and stderr to the null device
+        sys.stdout = devnull
+        sys.stderr = devnull
+        try:
+            yield
+        finally:
+            # Restore stdout and stderr
+            sys.stdout = old_stdout
+            sys.stderr = old_stderr
+
+
+@tool
+def query_retriever(question):
+    """Just a dummy tool to simulate the retriever query"""
+    return question
+
+
+
+
+def make_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
+
+    # The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
+    @chain
+    async def retrieve_documents(state,config):
+
+        keywords_extraction = make_keywords_extraction_chain(llm)
+
+        current_question = state["remaining_questions"][0]
+        remaining_questions = state["remaining_questions"][1:]
+
+        # ToolMessage(f"Retrieving documents for question: {current_question['question']}",tool_call_id = "retriever")
+
+        # # There are several options to get the final top k
+        # # Option 1 - Get 100 documents by question and rerank by question
+        # # Option 2 - Get 100/n documents by question and rerank the total
+        # if rerank_by_question:
+        #     k_by_question = divide_into_parts(k_final,len(questions))
+
+        # docs = state["documents"]
+        # if docs is None: docs = []
+
+        docs = []
+        k_by_question = k_final // state["n_questions"]
+
+        sources = current_question["sources"]
+        question = current_question["question"]
+        index = current_question["index"]
+
+        await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)
+
+        if index == "Vector":
+            # Search the document store using the retriever
+            # Configure high top k for further reranking step
+            retriever = ClimateQARetriever(
+                vectorstore=vectorstore,
+                sources = sources,
+                min_size = 200,
+                k_summary = k_summary,
+                k_total = k_before_reranking,
+                threshold = 0.5,
+            )
+            docs_question = await retriever.ainvoke(question,config)
+
+        elif index == "OpenAlex":
+            keywords = keywords_extraction.invoke(question)["keywords"]
+            openalex_query = " AND ".join(keywords)
+
+            print(f"... OpenAlex query: {openalex_query}")
+
+            retriever_openalex = OpenAlexRetriever(
+                min_year = state.get("min_year",1960),
+                max_year = state.get("max_year",None),
+                k = k_before_reranking
+            )
+            docs_question = await retriever_openalex.ainvoke(openalex_query,config)
+
+        else:
+            raise Exception(f"Index {index} not found in the routing index")
+
+        # Rerank
+        if reranker is not None:
+            with suppress_output():
+                docs_question = rerank_docs(reranker,docs_question,question)
+        else:
+            # Add a default reranking score
+            for doc in docs_question:
+                doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
+
+        # If rerank by question we select the top documents for each question
+        if rerank_by_question:
+            docs_question = docs_question[:k_by_question]
+
+        # Add sources used in the metadata
+        for doc in docs_question:
+            doc.metadata["sources_used"] = sources
+            doc.metadata["question_used"] = question
+            doc.metadata["index_used"] = index
+
+        # Add to the list of docs
+        docs.extend(docs_question)
+
+        # Sorting the list in descending order by rerank_score
+        docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
+        new_state = {"documents":docs,"remaining_questions":remaining_questions}
+        return new_state
+
+    return retrieve_documents
+
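A minimal usage sketch of the new retriever node (illustrative only, not part of the commit); it assumes vectorstore, reranker and llm are already instantiated, and the state values below are hypothetical stand-ins for what the upstream transform_query node would produce:

    from climateqa.engine.chains.retrieve_documents import make_retriever_node

    async def demo(vectorstore, reranker, llm):
        node = make_retriever_node(vectorstore, reranker, llm, k_final=15)
        state = {
            "remaining_questions": [
                # hypothetical question entry, normally built by transform_query
                {"question": "What is radiative forcing?", "sources": ["IPCC"], "index": "Vector"}
            ],
            "n_questions": 1,
        }
        # each invocation consumes one question and returns reranked documents
        new_state = await node.ainvoke(state)
        return new_state["documents"]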
climateqa/engine/chains/sample_router.py ADDED
@@ -0,0 +1,66 @@
+
+# from typing import List
+# from typing import Literal
+# from langchain.prompts import ChatPromptTemplate
+# from langchain_core.utils.function_calling import convert_to_openai_function
+# from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+# # https://livingdatalab.com/posts/2023-11-05-openai-function-calling-with-langchain.html
+
+# class Location(BaseModel):
+#     country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, adresses), ex: France, USA, ...")
+#     location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
+
+# class QueryAnalysis(BaseModel):
+#     """Analyzing the user query"""
+
+#     language: str = Field(
+#         description="Find the language of the query in full words (ex: French, English, Spanish, ...), defaults to English"
+#     )
+#     intent: str = Field(
+#         enum=[
+#             "Environmental impacts of AI",
+#             "Geolocated info about climate change",
+#             "Climate change",
+#             "Biodiversity",
+#             "Deep sea mining",
+#             "Chitchat",
+#         ],
+#         description="""
+#         Categorize the user query in one of the following category,
+
+#         Examples:
+#         - Geolocated info about climate change: "What will be the temperature in Marseille in 2050"
+#         - Climate change: "What is radiative forcing", "How much will
+#         """,
+#     )
+#     sources: List[Literal["IPCC", "IPBES", "IPOS"]] = Field(
+#         ...,
+#         description="""
+#         Given a user question choose which documents would be most relevant for answering their question,
+#         - IPCC is for questions about climate change, energy, impacts, and everything we can find the IPCC reports
+#         - IPBES is for questions about biodiversity and nature
+#         - IPOS is for questions about the ocean and deep sea mining
+
+#         """,
+#     )
+#     date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
+#     location:Location
+#     # query: str = Field(
+#     #     description = """
+#     #     Translate to english and reformulate the following user message to be a short standalone question, in the context of an educational discussion about climate change.
+#     #     The reformulated question will used in a search engine
+#     #     By default, assume that the user is asking information about the last century,
+#     #     Use the following examples
+
+#     #     ### Examples:
+#     #     La technologie nous sauvera-t-elle ? -> Can technology help humanity mitigate the effects of climate change?
+#     #     what are our reserves in fossil fuel? -> What are the current reserves of fossil fuels and how long will they last?
+#     #     what are the main causes of climate change? -> What are the main causes of climate change in the last century?
+
+#     #     Question in English:
+#     #     """
+#     # )
+
+# openai_functions = [convert_to_openai_function(QueryAnalysis)]
+# llm2 = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
climateqa/engine/chains/translation.py ADDED
@@ -0,0 +1,41 @@
+
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+from typing import Literal
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.utils.function_calling import convert_to_openai_function
+from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
+
+
+class Translation(BaseModel):
+    """Analyzing the user message input"""
+
+    translation: str = Field(
+        description="Translate the message input to English",
+    )
+
+
+def make_translation_chain(llm):
+
+    openai_functions = [convert_to_openai_function(Translation)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"Translation"})
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, you will translate the user input message to English using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_translation_node(llm):
+
+    translation_chain = make_translation_chain(llm)
+
+    def translate_query(state):
+        user_input = state["user_input"]
+        translation = translation_chain.invoke({"input":user_input})
+        return {"query":translation["translation"]}
+
+    return translate_query
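A short usage sketch of the translation chain (illustrative only; the French input is a hypothetical example), assuming an OpenAI-style chat model with function calling:

    from climateqa.engine.llm import get_llm
    from climateqa.engine.chains.translation import make_translation_chain

    llm = get_llm(provider="openai")
    chain = make_translation_chain(llm)
    result = chain.invoke({"input": "Quelles sont les causes du changement climatique ?"})
    # result is the parsed function call, e.g.
    # {"translation": "What are the causes of climate change?"}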
climateqa/engine/embeddings.py CHANGED
@@ -2,7 +2,7 @@
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.embeddings import HuggingFaceEmbeddings
 
-def get_embeddings_function(version = "v1.2"):
+def get_embeddings_function(version = "v1.2",query_instruction = "Represent this sentence for searching relevant passages: "):
 
     if version == "v1.2":
 
@@ -10,12 +10,12 @@ def get_embeddings_function(version = "v1.2"):
         # Best embedding model at a reasonable size at the moment (2023-11-22)
 
         model_name = "BAAI/bge-base-en-v1.5"
-        encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
+        encode_kwargs = {'normalize_embeddings': True,"show_progress_bar":False} # set True to compute cosine similarity
         print("Loading embeddings model: ", model_name)
         embeddings_function = HuggingFaceBgeEmbeddings(
             model_name=model_name,
             encode_kwargs=encode_kwargs,
-            query_instruction="Represent this sentence for searching relevant passages: "
+            query_instruction=query_instruction,
        )
 
     else:
@@ -23,3 +23,6 @@ def get_embeddings_function(version = "v1.2"):
         embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
 
     return embeddings_function
+
+
+
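The new query_instruction argument lets callers override or disable the BGE query prefix; a brief sketch (illustrative only):

    from climateqa.engine.embeddings import get_embeddings_function

    # default keeps the BGE retrieval instruction
    embeddings = get_embeddings_function(version="v1.2")

    # an empty instruction embeds the raw text, e.g. for passages
    embeddings_plain = get_embeddings_function(version="v1.2", query_instruction="")
    vector = embeddings_plain.embed_query("What is radiative forcing?")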
climateqa/engine/graph.py ADDED
@@ -0,0 +1,149 @@
+import sys
+import os
+from contextlib import contextmanager
+
+from langchain.schema import Document
+from langgraph.graph import END, StateGraph
+from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod
+
+from typing_extensions import TypedDict
+from typing import List
+
+from IPython.display import display, HTML, Image
+
+from .chains.answer_chitchat import make_chitchat_node
+from .chains.answer_ai_impact import make_ai_impact_node
+from .chains.query_transformation import make_query_transform_node
+from .chains.translation import make_translation_node
+from .chains.intent_categorization import make_intent_categorization_node
+from .chains.retrieve_documents import make_retriever_node
+from .chains.answer_rag import make_rag_node
+
+class GraphState(TypedDict):
+    """
+    Represents the state of our graph.
+    """
+    user_input : str
+    language : str
+    intent : str
+    query: str
+    remaining_questions : List[dict]
+    n_questions : int
+    answer: str
+    audience: str = "experts"
+    sources_input: List[str] = ["IPCC","IPBES"]
+    sources_auto: bool = True
+    min_year: int = 1960
+    max_year: int = None
+    documents: List[Document]
+
+def search(state): #TODO
+    return state
+
+def answer_search(state):#TODO
+    return state
+
+def route_intent(state):
+    intent = state["intent"]
+    if intent in ["chitchat","esg"]:
+        return "answer_chitchat"
+    # elif intent == "ai_impact":
+    #     return "answer_ai_impact"
+    else:
+        # Search route
+        return "search"
+
+def route_translation(state):
+    if state["language"].lower() == "english":
+        return "transform_query"
+    else:
+        return "translate_query"
+
+def route_based_on_relevant_docs(state,threshold_docs=0.2):
+    docs = [x for x in state["documents"] if x.metadata["reranking_score"] > threshold_docs]
+    if len(docs) > 0:
+        return "answer_rag"
+    else:
+        return "answer_rag_no_docs"
+
+
+def make_id_dict(values):
+    return {k:k for k in values}
+
+def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
+
+    workflow = StateGraph(GraphState)
+
+    # Define the node functions
+    categorize_intent = make_intent_categorization_node(llm)
+    transform_query = make_query_transform_node(llm)
+    translate_query = make_translation_node(llm)
+    answer_chitchat = make_chitchat_node(llm)
+    answer_ai_impact = make_ai_impact_node(llm)
+    retrieve_documents = make_retriever_node(vectorstore,reranker,llm)
+    answer_rag = make_rag_node(llm,with_docs=True)
+    answer_rag_no_docs = make_rag_node(llm,with_docs=False)
+
+    # Define the nodes
+    workflow.add_node("categorize_intent", categorize_intent)
+    workflow.add_node("search", search)
+    workflow.add_node("answer_search", answer_search)
+    workflow.add_node("transform_query", transform_query)
+    workflow.add_node("translate_query", translate_query)
+    workflow.add_node("answer_chitchat", answer_chitchat)
+    # workflow.add_node("answer_ai_impact", answer_ai_impact)
+    workflow.add_node("retrieve_documents",retrieve_documents)
+    workflow.add_node("answer_rag",answer_rag)
+    workflow.add_node("answer_rag_no_docs",answer_rag_no_docs)
+
+    # Entry point
+    workflow.set_entry_point("categorize_intent")
+
+    # CONDITIONAL EDGES
+    workflow.add_conditional_edges(
+        "categorize_intent",
+        route_intent,
+        make_id_dict(["answer_chitchat","search"])
+    )
+
+    workflow.add_conditional_edges(
+        "search",
+        route_translation,
+        make_id_dict(["translate_query","transform_query"])
+    )
+    workflow.add_conditional_edges(
+        "retrieve_documents",
+        lambda state : "retrieve_documents" if len(state["remaining_questions"]) > 0 else "answer_search",
+        make_id_dict(["retrieve_documents","answer_search"])
+    )
+
+    workflow.add_conditional_edges(
+        "answer_search",
+        lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
+        make_id_dict(["answer_rag","answer_rag_no_docs"])
+    )
+
+    # Define the edges
+    workflow.add_edge("translate_query", "transform_query")
+    workflow.add_edge("transform_query", "retrieve_documents")
+    workflow.add_edge("answer_rag", END)
+    workflow.add_edge("answer_rag_no_docs", END)
+    workflow.add_edge("answer_chitchat", END)
+    # workflow.add_edge("answer_ai_impact", END)
+
+    # Compile
+    app = workflow.compile()
+    return app
+
+
+
+
+def display_graph(app):
+
+    display(
+        Image(
+            app.get_graph(xray = True).draw_mermaid_png(
+                draw_method=MermaidDrawMethod.API,
+            )
+        )
+    )
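A minimal sketch of compiling and streaming the agent graph (illustrative only; the event-handling below is hypothetical glue code and assumes llm, vectorstore and reranker already exist):

    from climateqa.engine.graph import make_graph_agent

    app = make_graph_agent(llm, vectorstore, reranker, threshold_docs=0.2)

    async def answer(question: str):
        state = {"user_input": question, "audience": "experts"}
        # astream_events surfaces intermediate steps, including the
        # "log_retriever" events emitted through log_event
        async for event in app.astream_events(state, version="v1"):
            if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents":
                print("retrieved", len(event["data"]["output"]["documents"]), "docs")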
climateqa/engine/llm/__init__.py CHANGED
@@ -1,5 +1,6 @@
 from climateqa.engine.llm.openai import get_llm as get_openai_llm
 from climateqa.engine.llm.azure import get_llm as get_azure_llm
+from climateqa.engine.llm.ollama import get_llm as get_ollama_llm
 
 
 def get_llm(provider="openai",**kwargs):
@@ -8,6 +9,8 @@ def get_llm(provider="openai",**kwargs):
         return get_openai_llm(**kwargs)
     elif provider == "azure":
         return get_azure_llm(**kwargs)
+    elif provider == "ollama":
+        return get_ollama_llm(**kwargs)
     else:
         raise ValueError(f"Unknown provider: {provider}")
 
climateqa/engine/llm/ollama.py ADDED
@@ -0,0 +1,6 @@
+
+
+from langchain_community.llms import Ollama
+
+def get_llm(model="llama3", **kwargs):
+    return Ollama(model=model, **kwargs)
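With the new provider branch and the Ollama wrapper, a local model can be swapped in without touching the rest of the engine; a sketch assuming an Ollama server is running locally with the model already pulled:

    from climateqa.engine.llm import get_llm

    llm = get_llm(provider="ollama", model="llama3")
    print(llm.invoke("Say hello in one word"))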
climateqa/engine/rag.py DELETED
@@ -1,134 +0,0 @@
-from operator import itemgetter
-
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
-from langchain_core.prompts.prompt import PromptTemplate
-from langchain_core.prompts.base import format_document
-
-from climateqa.engine.reformulation import make_reformulation_chain
-from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
-from climateqa.engine.prompts import papers_prompt_template
-from climateqa.engine.utils import pass_values, flatten_dict,prepare_chain,rename_chain
-from climateqa.engine.keywords import make_keywords_chain
-
-DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
-
-def _combine_documents(
-    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
-):
-
-    doc_strings = []
-
-    for i,doc in enumerate(docs):
-        # chunk_type = "Doc" if doc.metadata["chunk_type"] == "text" else "Image"
-        chunk_type = "Doc"
-        if isinstance(doc,str):
-            doc_formatted = doc
-        else:
-            doc_formatted = format_document(doc, document_prompt)
-        doc_string = f"{chunk_type} {i+1}: " + doc_formatted
-        doc_string = doc_string.replace("\n"," ")
-        doc_strings.append(doc_string)
-
-    return sep.join(doc_strings)
-
-
-def get_text_docs(x):
-    return [doc for doc in x if doc.metadata["chunk_type"] == "text"]
-
-def get_image_docs(x):
-    return [doc for doc in x if doc.metadata["chunk_type"] == "image"]
-
-
-def make_rag_chain(retriever,llm):
-
-    # Construct the prompt
-    prompt = ChatPromptTemplate.from_template(answer_prompt_template)
-    prompt_without_docs = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
-
-    # ------- CHAIN 0 - Reformulation
-    reformulation = make_reformulation_chain(llm)
-    reformulation = prepare_chain(reformulation,"reformulation")
-
-    # ------- Find all keywords from the reformulated query
-    keywords = make_keywords_chain(llm)
-    keywords = {"keywords":itemgetter("question") | keywords}
-    keywords = prepare_chain(keywords,"keywords")
-
-    # ------- CHAIN 1
-    # Retrieved documents
-    find_documents = {"docs": itemgetter("question") | retriever} | RunnablePassthrough()
-    find_documents = prepare_chain(find_documents,"find_documents")
-
-    # ------- CHAIN 2
-    # Construct inputs for the llm
-    input_documents = {
-        "context":lambda x : _combine_documents(x["docs"]),
-        **pass_values(["question","audience","language","keywords"])
-    }
-
-    # ------- CHAIN 3
-    # Bot answer
-    llm_final = rename_chain(llm,"answer")
-
-    answer_with_docs = {
-        "answer": input_documents | prompt | llm_final | StrOutputParser(),
-        **pass_values(["question","audience","language","query","docs","keywords"]),
-    }
-
-    answer_without_docs = {
-        "answer": prompt_without_docs | llm_final | StrOutputParser(),
-        **pass_values(["question","audience","language","query","docs","keywords"]),
-    }
-
-    # def has_images(x):
-    #     image_docs = [doc for doc in x["docs"] if doc.metadata["chunk_type"]=="image"]
-    #     return len(image_docs) > 0
-
-    def has_docs(x):
-        return len(x["docs"]) > 0
-
-    answer = RunnableBranch(
-        (lambda x: has_docs(x), answer_with_docs),
-        answer_without_docs,
-    )
-
-
-    # ------- FINAL CHAIN
-    # Build the final chain
-    rag_chain = reformulation | keywords | find_documents | answer
-
-    return rag_chain
-
-
-def make_rag_papers_chain(llm):
-
-    prompt = ChatPromptTemplate.from_template(papers_prompt_template)
-
-    input_documents = {
-        "context":lambda x : _combine_documents(x["docs"]),
-        **pass_values(["question","language"])
-    }
-
-    chain = input_documents | prompt | llm | StrOutputParser()
-    chain = rename_chain(chain,"answer")
-
-    return chain
-
-
-
-
-
-
-def make_illustration_chain(llm):
-
-    prompt_with_images = ChatPromptTemplate.from_template(answer_prompt_images_template)
-
-    input_description_images = {
-        "images":lambda x : _combine_documents(get_image_docs(x["docs"])),
-        **pass_values(["question","audience","language","answer"]),
-    }
-
-    illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
-    return illustration_chain
climateqa/engine/reranker.py ADDED
@@ -0,0 +1,40 @@
+import os
+from scipy.special import expit, logit
+from rerankers import Reranker
+
+
+def get_reranker(model = "nano",cohere_api_key = None):
+
+    assert model in ["nano","tiny","small","large"]
+
+    if model == "nano":
+        reranker = Reranker('ms-marco-TinyBERT-L-2-v2', model_type='flashrank')
+    elif model == "tiny":
+        reranker = Reranker('ms-marco-MiniLM-L-12-v2', model_type='flashrank')
+    elif model == "small":
+        reranker = Reranker("mixedbread-ai/mxbai-rerank-xsmall-v1", model_type='cross-encoder')
+    elif model == "large":
+        if cohere_api_key is None:
+            cohere_api_key = os.environ["COHERE_API_KEY"]
+        reranker = Reranker("cohere", lang='en', api_key = cohere_api_key)
+    return reranker
+
+
+
+def rerank_docs(reranker,docs,query):
+
+    # Get a list of texts from langchain docs
+    input_docs = [x.page_content for x in docs]
+
+    # Rerank using rerankers library
+    results = reranker.rank(query=query, docs=input_docs)
+
+    # Prepare langchain list of docs
+    docs_reranked = []
+    for result in results.results:
+        doc_id = result.document.doc_id
+        doc = docs[doc_id]
+        doc.metadata["reranking_score"] = result.score
+        doc.metadata["query_used_for_retrieval"] = query
+        docs_reranked.append(doc)
+    return docs_reranked
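A usage sketch chaining the two helpers (illustrative only, with hypothetical documents); the "nano" and "tiny" options run locally via flashrank, while "large" needs a Cohere API key:

    from langchain.schema import Document
    from climateqa.engine.reranker import get_reranker, rerank_docs

    reranker = get_reranker("nano")
    docs = [
        Document(page_content="Radiative forcing measures the change in energy flux..."),
        Document(page_content="Sea level rise is driven by thermal expansion..."),
    ]
    ranked = rerank_docs(reranker, docs, query="What is radiative forcing?")
    for doc in ranked:
        # rerank_docs stores the score and the query in the metadata
        print(doc.metadata["reranking_score"], doc.metadata["query_used_for_retrieval"])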
climateqa/engine/utils.py CHANGED
@@ -1,8 +1,15 @@
 from operator import itemgetter
 from typing import Any, Dict, Iterable, Tuple
+import tiktoken
 from langchain_core.runnables import RunnablePassthrough
 
 
+def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
+    encoding = tiktoken.get_encoding(encoding_name)
+    num_tokens = len(encoding.encode(string))
+    return num_tokens
+
+
 def pass_values(x):
     if not isinstance(x, list):
         x = [x]
@@ -67,3 +74,13 @@ flatten_dict(
     """
     flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
     return flat_dict
+
+
+
+async def log_event(info,name,config):
+    """Helper function that will run a dummy chain with the given info
+    The astream_event function will catch this chain and stream the dict info to the logger
+    """
+
+    chain = RunnablePassthrough().with_config(run_name=name)
+    _ = await chain.ainvoke(info,config)
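A sketch of the two new helpers (illustrative only); log_event is meant to be awaited inside a graph node, where config is the RunnableConfig the node receives from LangChain:

    from climateqa.engine.utils import num_tokens_from_string, log_event

    n = num_tokens_from_string("What is radiative forcing?")  # counted with tiktoken's cl100k_base

    async def some_node(state, config):
        # streams the dict to any astream_events consumer under the run name "log_retriever"
        await log_event({"question": state["query"]}, "log_retriever", config)
        return state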
climateqa/knowledge/__init__.py ADDED
File without changes
climateqa/{papers → knowledge}/openalex.py RENAMED
@@ -3,18 +3,32 @@ import networkx as nx
 import matplotlib.pyplot as plt
 from pyvis.network import Network
 
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.vectorstores import VectorStoreRetriever
+from langchain_core.documents.base import Document
+from langchain_core.vectorstores import VectorStore
+from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+
+from ..engine.utils import num_tokens_from_string
+
+from typing import List
+from pydantic import Field
+
 from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
 import pyalex
 
 pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
 
+
+def replace_nan_with_empty_dict(x):
+    return x if pd.notna(x) else {}
+
 class OpenAlex():
     def __init__(self):
         pass
 
 
-
-    def search(self,keywords,n_results = 100,after = None,before = None):
+    def search(self,keywords:str,n_results = 100,after = None,before = None):
 
         if isinstance(keywords,str):
             works = Works().search(keywords)
@@ -27,18 +41,21 @@
             break
 
         df_works = pd.DataFrame(page)
-        df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x))
+        df_works = df_works.dropna(subset = ["title"])
+        df_works["primary_location"] = df_works["primary_location"].map(replace_nan_with_empty_dict)
+        df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x)).fillna("")
         df_works["is_oa"] = df_works["open_access"].map(lambda x : x.get("is_oa",False))
         df_works["pdf_url"] = df_works["primary_location"].map(lambda x : x.get("pdf_url",None))
-        df_works["content"] = df_works["title"] + "\n" + df_works["abstract"]
-
+        df_works["url"] = df_works["id"]
+        df_works["content"] = (df_works["title"] + "\n" + df_works["abstract"]).map(lambda x : x.strip())
+        df_works["num_tokens"] = df_works["content"].map(lambda x : num_tokens_from_string(x))
+
+        df_works = df_works.drop(columns = ["abstract_inverted_index"])
+        # df_works["subtitle"] = df_works["title"] + " - " + df_works["primary_location"]["source"]["display_name"] + " - " + df_works["publication_year"]
+
+        return df_works
     else:
-        df_works = []
-        for keyword in keywords:
-            df_keyword = self.search(keyword,n_results = n_results,after = after,before = before)
-            df_works.append(df_keyword)
-        df_works = pd.concat(df_works,ignore_index=True,axis = 0)
-    return df_works
+        raise Exception("Keywords must be a string")
 
 
     def rerank(self,query,df,reranker):
@@ -139,4 +156,36 @@
             reconstructed[position] = token
 
         # Join the tokens to form the reconstructed sentence(s)
-        return ' '.join(reconstructed)
+        return ' '.join(reconstructed)
+
+
+
+class OpenAlexRetriever(BaseRetriever):
+    min_year:int = 1960
+    max_year:int = None
+    k:int = 100
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+
+        openalex = OpenAlex()
+
+        # Search for documents
+        df_docs = openalex.search(query,n_results=self.k,after = self.min_year,before = self.max_year)
+
+        docs = []
+        for i,row in df_docs.iterrows():
+            num_tokens = row["num_tokens"]
+
+            if num_tokens < 50 or num_tokens > 1000:
+                continue
+
+            doc = Document(
+                page_content = row["content"],
+                metadata = row.to_dict()
+            )
+            docs.append(doc)
+        return docs
+
+
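A usage sketch of the new retriever (illustrative only, hypothetical query); works shorter than 50 or longer than 1000 tokens are skipped, and the full OpenAlex record travels in the document metadata:

    from climateqa.knowledge.openalex import OpenAlexRetriever

    retriever = OpenAlexRetriever(min_year=2015, k=50)
    docs = retriever.invoke("climate change attribution")
    for doc in docs[:3]:
        print(doc.metadata["url"], doc.metadata["is_oa"])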
climateqa/{engine → knowledge}/retriever.py RENAMED
@@ -66,6 +66,7 @@ class ClimateQARetriever(BaseRetriever):
         # Add score to metadata
         results = []
         for i,(doc,score) in enumerate(docs):
+            doc.page_content = doc.page_content.replace("\r\n"," ")
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
             doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
@@ -78,86 +79,3 @@ class ClimateQARetriever(BaseRetriever):
         return results
 
 
-
-
-# def filter_summaries(df,k_summary = 3,k_total = 10):
-#     # assert source in ["IPCC","IPBES","ALL"], "source arg should be in (IPCC,IPBES,ALL)"
-
-#     # # Filter by source
-#     # if source == "IPCC":
-#     #     df = df.loc[df["source"]=="IPCC"]
-#     # elif source == "IPBES":
-#     #     df = df.loc[df["source"]=="IPBES"]
-#     # else:
-#     #     pass
-
-#     # Separate summaries and full reports
-#     df_summaries = df.loc[df["report_type"].isin(["SPM","TS"])]
-#     df_full = df.loc[~df["report_type"].isin(["SPM","TS"])]
-
-#     # Find passages from summaries dataset
-#     passages_summaries = df_summaries.head(k_summary)
-
-#     # Find passages from full reports dataset
-#     passages_fullreports = df_full.head(k_total - len(passages_summaries))
-
-#     # Concatenate passages
-#     passages = pd.concat([passages_summaries,passages_fullreports],axis = 0,ignore_index = True)
-#     return passages
-
-
-
-
-# def retrieve_with_summaries(query,retriever,k_summary = 3,k_total = 10,sources = ["IPCC","IPBES"],max_k = 100,threshold = 0.555,as_dict = True,min_length = 300):
-#     assert max_k > k_total
-
-#     validated_sources = ["IPCC","IPBES"]
-#     sources = [x for x in sources if x in validated_sources]
-#     filters = {
-#         "source": { "$in": sources },
-#     }
-#     print(filters)
-
-#     # Retrieve documents
-#     docs = retriever.retrieve(query,top_k = max_k,filters = filters)
-
-#     # Filter by score
-#     docs = [{**x.meta,"score":x.score,"content":x.content} for x in docs if x.score > threshold]
-
-#     if len(docs) == 0:
-#         return []
-#     res = pd.DataFrame(docs)
-#     passages_df = filter_summaries(res,k_summary,k_total)
-#     if as_dict:
-#         contents = passages_df["content"].tolist()
-#         meta = passages_df.drop(columns = ["content"]).to_dict(orient = "records")
-#         passages = []
-#         for i in range(len(contents)):
-#             passages.append({"content":contents[i],"meta":meta[i]})
-#         return passages
-#     else:
-#         return passages_df
-
-
-
-# def retrieve(query,sources = ["IPCC"],threshold = 0.555,k = 10):
-
-
-#     print("hellooooo")
-
-#     # Reformulate queries
-#     reformulated_query,language = reformulate(query)
-
-#     print(reformulated_query)
-
-#     # Retrieve documents
-#     passages = retrieve_with_summaries(reformulated_query,retriever,k_total = k,k_summary = 3,as_dict = True,sources = sources,threshold = threshold)
-#     response = {
-#         "query":query,
-#         "reformulated_query":reformulated_query,
-#         "language":language,
-#         "sources":passages,
-#         "prompts":{"init_prompt":init_prompt,"sources_prompt":sources_prompt},
-#     }
-#     return response
-
climateqa/papers/__init__.py DELETED
@@ -1,43 +0,0 @@
-import pandas as pd
-
-from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
-import pyalex
-
-pyalex.config.email = "theo.alvesdacosta@ekimetrics.com"
-
-class OpenAlex():
-    def __init__(self):
-        pass
-
-
-
-    def search(self,keywords,n_results = 100,after = None,before = None):
-        works = Works().search(keywords).get()
-
-        for page in works.paginate(per_page=n_results):
-            break
-
-        df_works = pd.DataFrame(page)
-
-        return works
-
-
-    def make_network(self):
-        pass
-
-
-    def get_abstract_from_inverted_index(self,index):
-
-        # Determine the maximum index to know the length of the reconstructed array
-        max_index = max([max(positions) for positions in index.values()])
-
-        # Initialize a list with placeholders for all positions
-        reconstructed = [''] * (max_index + 1)
-
-        # Iterate through the inverted index and place each token at its respective position(s)
-        for token, positions in index.items():
-            for position in positions:
-                reconstructed[position] = token
-
-        # Join the tokens to form the reconstructed sentence(s)
-        return ' '.join(reconstructed)
front/__init__.py ADDED
File without changes
front/callbacks.py ADDED
File without changes
front/utils.py ADDED
@@ -0,0 +1,142 @@
+
+import re
+
+def make_pairs(lst):
+    """From a list of even length, make tuple pairs"""
+    return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)]
+
+
+def serialize_docs(docs):
+    new_docs = []
+    for doc in docs:
+        new_doc = {}
+        new_doc["page_content"] = doc.page_content
+        new_doc["metadata"] = doc.metadata
+        new_docs.append(new_doc)
+    return new_docs
+
+
+
+def parse_output_llm_with_sources(output):
+    # Split the content into a list of text and "[Doc X]" references
+    content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
+    parts = []
+    for part in content_parts:
+        if part.startswith("Doc"):
+            subparts = part.split(",")
+            subparts = [subpart.lower().replace("doc","").strip() for subpart in subparts]
+            subparts = [f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{subpart}</sup></span></a>""" for subpart in subparts]
+            parts.append("".join(subparts))
+        else:
+            parts.append(part)
+    content_parts = "".join(parts)
+    return content_parts
+
+
+def make_html_source(source,i):
+    meta = source.metadata
+    # content = source.page_content.split(":",1)[1].strip()
+    content = source.page_content.strip()
+
+    toc_levels = []
+    for j in range(2):
+        level = meta[f"toc_level{j}"]
+        if level != "N/A":
+            toc_levels.append(level)
+        else:
+            break
+    toc_levels = " > ".join(toc_levels)
+
+    if len(toc_levels) > 0:
+        name = f"<b>{toc_levels}</b><br/>{meta['name']}"
+    else:
+        name = meta['name']
+
+    score = meta['reranking_score']
+    if score > 0.8:
+        color = "score-green"
+    elif score > 0.4:
+        color = "score-orange"
+    else:
+        color = "score-red"
+
+    relevancy_score = f"<p class=relevancy-score>Relevancy score: <span class='{color}'>{score:.1%}</span></p>"
+
+    if meta["chunk_type"] == "text":
+
+        card = f"""
+            <div class="card" id="doc{i}">
+                <div class="card-content">
+                    <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
+                    <p>{content}</p>
+                    {relevancy_score}
+                </div>
+                <div class="card-footer">
+                    <span>{name}</span>
+                    <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                        <span role="img" aria-label="Open PDF">🔗</span>
+                    </a>
+                </div>
+            </div>
+            """
+
+    else:
+
+        if meta["figure_code"] != "N/A":
+            title = f"{meta['figure_code']} - {meta['short_name']}"
+        else:
+            title = f"{meta['short_name']}"
+
+        card = f"""
+        <div class="card card-image">
+            <div class="card-content">
+                <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
+                <p>{content}</p>
+                <p class='ai-generated'>AI-generated description</p>
+                {relevancy_score}
+            </div>
+            <div class="card-footer">
+                <span>{name}</span>
+                <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                    <span role="img" aria-label="Open PDF">🔗</span>
+                </a>
+            </div>
+        </div>
+        """
+
+    return card
+
+
+
+def make_toolbox(tool_name,description = "",checked = False,elem_id = "toggle"):
+
+    if checked:
+        span = "<span class='checkmark'>&#10003;</span>"
+    else:
+        span = "<span class='loader'></span>"
+
+    # toolbox = f"""
+    # <div class="dropdown">
+    #     <label for="{elem_id}" class="dropdown-toggle">
+    #         {span}
+    #         {tool_name}
+    #         <span class="caret"></span>
+    #     </label>
+    #     <input type="checkbox" id="{elem_id}" hidden/>
+    #     <div class="dropdown-content">
+    #         <p>{description}</p>
+    #     </div>
+    # </div>
+    # """
+
+
+    toolbox = f"""
+    <div class="dropdown">
+        <label for="{elem_id}" class="dropdown-toggle">
+            {span}
+            {tool_name}
+        </label>
+    </div>
+    """
+
+    return toolbox
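A sketch of how the reference parser rewrites model output (illustrative only, hypothetical answer text); each [Doc i] span becomes a superscript anchor pointing at the matching source card from make_html_source:

    from front.utils import parse_output_llm_with_sources

    html = parse_output_llm_with_sources(
        "Warming is largely driven by greenhouse gas emissions [Doc 1, Doc 2]."
    )
    # roughly: '... emissions <a href="#doc1" class="a-doc-ref" ...><sup>1</sup></a>
    #           <a href="#doc2" ...><sup>2</sup></a>.'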
requirements.txt CHANGED
@@ -1,13 +1,19 @@
-gradio==4.19.1
+gradio==4.44
 azure-storage-file-share==12.11.1
 azure-storage-blob
 python-dotenv==1.0.0
-langchain==0.1.4
-langchain_openai==0.0.6
-pinecone-client==3.0.2
+langchain==0.2.1
+langchain_openai==0.1.7
+langgraph==0.0.55
+pinecone-client==4.1.0
 sentence-transformers==2.6.0
 huggingface-hub
-msal
 pyalex==0.13
 networkx==3.2.1
-pyvis==0.3.2
+pyvis==0.3.2
+flashrank==0.2.5
+rerankers==0.3.0
+torch==2.3.0
+nvidia-cudnn-cu12==8.9.2.26
+langchain-community==0.2
+msal==1.31
sandbox/20240310 - CQA - Semantic Routing 1.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
style.css CHANGED
@@ -2,6 +2,14 @@
 /* :root {
   --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
 } */
+.avatar-container.svelte-1x5p6hu:not(.thumbnail-item) img {
+  width: 100%;
+  height: 100%;
+  object-fit: cover;
+  border-radius: 50%;
+  padding: 0px;
+  margin: 0px;
+}
 
 .warning-box {
   background-color: #fff3cd;
@@ -57,6 +65,7 @@ body.dark .tip-box * {
 
 .message{
   font-size:14px !important;
+
 }
 
 
@@ -65,6 +74,10 @@ a {
   color: inherit;
 }
 
+.doc-ref sup{
+  color:#dc2626!important;
+  /* margin-right:1px; */
+}
 .card {
   background-color: white;
   border-radius: 10px;
@@ -363,3 +376,108 @@ span.chatbot > p > img{
 .a-doc-ref{
   text-decoration: none !important;
 }
+
+
+.dropdown {
+  position: relative;
+  display:inline-block;
+  margin-bottom: 10px;
+}
+
+.dropdown-toggle {
+  background-color: #f2f2f2;
+  color: black;
+  padding: 10px;
+  font-size: 16px;
+  cursor: pointer;
+  display: block;
+  width: 400px; /* Adjust width as needed */
+  position: relative;
+  display: flex;
+  align-items: center; /* Vertically center the contents */
+  justify-content: left;
+}
+
+.dropdown-toggle .caret {
+  content: "";
+  position: absolute;
+  right: 10px;
+  top: 50%;
+  border-left: 5px solid transparent;
+  border-right: 5px solid transparent;
+  border-top: 5px solid black;
+  transform: translateY(-50%);
+}
+
+input[type="checkbox"] {
+  display: none !important;
+}
+
+input[type="checkbox"]:checked + .dropdown-content {
+  display: block;
+}
+
+.dropdown-content {
+  display: none;
+  position: absolute;
+  background-color: #f9f9f9;
+  min-width: 300px;
+  box-shadow: 0 8px 16px 0 rgba(0,0,0,0.2);
+  z-index: 1;
+  padding: 12px;
+  border: 1px solid #ccc;
+}
+
+input[type="checkbox"]:checked + .dropdown-toggle + .dropdown-content {
+  display: block;
+}
+
+input[type="checkbox"]:checked + .dropdown-toggle .caret {
+  border-top: 0;
+  border-bottom: 5px solid black;
+}
+
+.loader {
+  border: 1px solid #d0d0d0 !important; /* Light grey background */
+  border-top: 1px solid #db3434 !important; /* Red accent */
+  border-right: 1px solid #3498db !important; /* Blue accent */
+  border-radius: 50%;
+  width: 20px;
+  height: 20px;
+  animation: spin 2s linear infinite;
+  display:inline-block;
+  margin-right:10px !important;
+}
+
+.checkmark{
+  color:green !important;
+  font-size:18px;
+  margin-right:10px !important;
+}
+
+@keyframes spin {
+  0% { transform: rotate(0deg); }
+  100% { transform: rotate(360deg); }
+}
+
+
+.relevancy-score{
+  margin-top:10px !important;
+  font-size:10px !important;
+  font-style:italic;
+}
+
+.score-green{
+  color:green !important;
+}
+
+.score-orange{
+  color:orange !important;
+}
+
+.score-red{
+  color:red !important;
+}
+.message-buttons-left.panel.message-buttons.with-avatar {
+  display: none;
+}
test.json ADDED
File without changes