import gradio as gr
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever
import openai
import pandas as pd
import os
from utils import (
make_pairs,
set_openai_api_key,
create_user_id,
to_completion,
)
from datetime import datetime
# from azure.storage.fileshare import ShareServiceClient
try:
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    pass
theme = gr.themes.Soft(
primary_hue="sky",
font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif", "system-ui", "sans-serif"],
)
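# Prompt scaffolding: init_prompt seeds the system message, and sources_prompt is
# prepended to the retrieved passages before they are handed to the model.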
init_prompt = (
"TKOQA, an AI Assistant for Tikehau. "
)
sources_prompt = (
"When relevant, use facts and numbers from the following documents in your answer. "
)
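# Few-shot reformulation prompt: the raw user message is rewritten into a short
# standalone English question before retrieval, and the detected language is
# reported on a separate "language:" line.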
def get_reformulation_prompt(query: str) -> str:
return f"""Reformulate the following user message to be a short standalone question in English, in the context of the Universal Registration Document of Tikehau .
---
query: what is the AUM of Tikehau in 2022?
standalone question: What is the AUM of Tikehau in 2022?
language: English
---
query: what is T2?
standalone question: what is the transition energy fund at Tikehau?
language: English
---
query: what is the business of Tikehau?
standalone question: What are the main business units of Tikehau?
language: English
---
query: {query}
standalone question:"""
system_template = {
"role": "system",
"content": init_prompt,
}
openai.api_key = os.environ["OPENAI_API_KEY"]
# BHO
# openai.api_base = os.environ["ressource_endpoint"]
# openai.api_version = "2022-12-01"
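# Load the prebuilt FAISS index over the Tikehau URDs and attach a dense retriever
# that embeds queries with a sentence-transformers model (which must match the one
# used to build the index).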
ds = FAISSDocumentStore.load(index_path="./tko_urd.faiss", config_path="./tko_urd.json")
retriever = EmbeddingRetriever(
document_store=ds,
embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
model_format="sentence_transformers",
progress_bar=False,
)
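# The index files above are assumed to have been built offline. A minimal sketch of
# such a build (hypothetical document list and parameters, not the actual script):
#
#   store = FAISSDocumentStore(embedding_dim=768, faiss_index_factory_str="Flat")
#   store.write_documents(docs)  # docs: haystack Documents extracted from the URD PDFs
#   store.update_embeddings(retriever)  # compute and store passage embeddings
#   store.save(index_path="./tko_urd.faiss", config_path="./tko_urd.json")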
# retrieve_giec = EmbeddingRetriever(
# document_store=FAISSDocumentStore.load(
# index_path="./documents/climate_gpt_v2_only_giec.faiss",
# config_path="./documents/climate_gpt_v2_only_giec.json",
# ),
# embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
# model_format="sentence_transformers",
# )
# BHO
# For Azure connection in secrets in HuggingFace
# credential = {
# "account_key": os.environ["account_key"],
# "account_name": os.environ["account_name"],
# }
# BHO
# account_url = os.environ["account_url"]
# file_share_name = "climategpt"
# service = ShareServiceClient(account_url=account_url, credential=credential)
# share_client = service.get_share_client(file_share_name)
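# Per-session random id used to key the JSON log files.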
user_id = create_user_id(10)
def filter_sources(df, k_summary=3, k_total=10, source="ipcc"):
    """Filter retrieved passages by source and keep the top k_summary.

    The summaries/full-report split from the original implementation is disabled:
    all passages are treated as a single pool, and k_total is currently unused.
    """
    assert source in ["ipcc", "ipbes", "all"]

    # Filter by source
    if source == "ipcc":
        df = df.loc[df["source"] == "IPCC"]
    elif source == "ipbes":
        df = df.loc[df["source"] == "IPBES"]

    # Keep the top passages (summary/full-report separation disabled)
    passages = df.head(k_summary)
    return passages
def retrieve_with_summaries(query, retriever, k_summary=3, k_total=10, source="ipcc", max_k=100, threshold=0.555,
                            as_dict=True):
    """Retrieve up to max_k passages, drop those scoring below threshold, then filter with filter_sources.

    Returns a list of {"content", "meta"} dicts if as_dict is True, otherwise a DataFrame.
    Returns an empty list if no passage clears the threshold.
    """
    assert max_k > k_total

    docs = retriever.retrieve(query, top_k=max_k)
    docs = [{**x.meta, "score": x.score, "content": x.content} for x in docs if x.score > threshold]
    if len(docs) == 0:
        return []
    res = pd.DataFrame(docs)
    passages_df = filter_sources(res, k_summary, k_total, source)

    if as_dict:
        contents = passages_df["content"].tolist()
        meta = passages_df.drop(columns=["content"]).to_dict(orient="records")
        passages = [{"content": c, "meta": m} for c, m in zip(contents, meta)]
        return passages
    else:
        return passages_df
def make_html_source(source, i):
    """Format a retrieved passage for the sources panel (rendered by a gr.Markdown component)."""
    meta = source['meta']
return f"""
Doc {i} - {meta['file_name']} - Page {meta['page_number']}
{source['content']}
"""
def chat(
    user_id: list,
    query: str,
    history: list = [system_template],
    report_type: str = "All available",
    threshold: float = 0.555,
) -> tuple:
    """Retrieve relevant documents from the document store, then query the completion model.

    Args:
        user_id (list): single-element list holding the session id (passed in via a gr.State).
        query (str): user message.
        history (list, optional): history of the conversation. Defaults to [system_template].
        report_type (str, optional): "IPCC", "IPBES", or anything else to search all sources.
            Defaults to "All available".
        threshold (float, optional): similarity threshold; don't increase beyond 0.568.
            Defaults to 0.555.

    Yields:
        tuple: chat in gradio format, chat in openai format, sources used.
    """
    if report_type not in ["IPCC", "IPBES"]:
        report_type = "all"
    print("Searching in", report_type, "reports")
reformulated_query = openai.Completion.create(
engine="text-davinci-003",
prompt=get_reformulation_prompt(query),
temperature=0,
max_tokens=128,
stop=["\n---\n", "<|im_end|>"],
)
    reformulated_text = reformulated_query["choices"][0]["text"].strip()
    lines = reformulated_text.split("\n")
    reformulated_query = lines[0].strip()
    # The few-shot prompt asks for a "language: <lang>" line; default to English if the model omits it.
    language = lines[1].split(":")[-1].strip() if len(lines) > 1 else "English"
sources = retrieve_with_summaries(reformulated_query, retriever, k_total=10, k_summary=3, as_dict=True,
source=report_type.lower(), threshold=threshold)
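    # Bundle the retrieval outputs for inspection; not used further in this function.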
response_retriever = {
"language": language,
"reformulated_query": reformulated_query,
"query": query,
"sources": sources,
}
# docs = [d for d in retriever.retrieve(query=reformulated_query, top_k=10) if d.score > threshold]
messages = history + [{"role": "user", "content": query}]
if len(sources) > 0:
docs_string = []
docs_html = []
for i, d in enumerate(sources, 1):
#docs_string.append(f"📃 Doc {i}: {d['meta']['short_name']} page {d['meta']['page_number']}\n{d['content']}")
docs_string.append(f"📃 Doc {i}: {d['meta']['file_name']} page {d['meta']['page_number']}\n{d['content']}")
docs_html.append(make_html_source(d, i))
docs_string = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_string)
docs_html = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_html)
messages.append({"role": "system", "content": f"{sources_prompt}\n\n{docs_string}\n\nAnswer in {language}:"})
response = openai.Completion.create(
# engine="climateGPT",
engine="text-davinci-003",
prompt=to_completion(messages),
temperature=0, # deterministic
stream=True,
max_tokens=1024,
)
complete_response = ""
messages.pop()
messages.append({"role": "assistant", "content": complete_response})
timestamp = str(datetime.now().timestamp())
file = user_id[0] + timestamp + ".json"
logs = {
"user_id": user_id[0],
"prompt": query,
"retrived": sources,
"report_type": report_type,
"prompt_eng": messages[0],
"answer": messages[-1]["content"],
"time": timestamp,
}
# log_on_azure(file, logs, share_client)
print(logs)
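        # Append each streamed chunk to the assistant message and yield the updated
        # chat so the Gradio UI refreshes incrementally.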
for chunk in response:
if (chunk_message := chunk["choices"][0].get("text")) and chunk_message != "<|im_end|>":
complete_response += chunk_message
messages[-1]["content"] = complete_response
gradio_format = make_pairs([a["content"] for a in messages[1:]])
yield gradio_format, messages, docs_html
else:
docs_string = "⚠️ No relevant passages found in the URDs"
complete_response = "**⚠️ No relevant passages found in the URDs **"
messages.append({"role": "assistant", "content": complete_response})
gradio_format = make_pairs([a["content"] for a in messages[1:]])
yield gradio_format, messages, docs_string
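# Feedback handler: in this version it only prints the payload (the Azure upload is
# commented out) and is not wired to a UI component.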
def save_feedback(feed: str, user_id):
if len(feed) > 1:
timestamp = str(datetime.now().timestamp())
file = user_id[0] + timestamp + ".json"
logs = {
"user_id": user_id[0],
"feedback": feed,
"time": timestamp,
}
# log_on_azure(file, logs, share_client)
print(logs)
return "Feedback submitted, thank you!"
def reset_textbox():
return gr.update(value="")
# def log_on_azure(file, logs, share_client):
# file_client = share_client.get_file_client(file)
# file_client.upload_file(str(logs))
with gr.Blocks(title="TKO URD Q&A", css="style.css", theme=theme) as demo:
user_id_state = gr.State([user_id])
# Gradio
gr.Markdown("Tikehau Capital Q&A
")
with gr.Row():
with gr.Column(scale=2):
chatbot = gr.Chatbot(elem_id="chatbot", label=" Tikehau Capital Q&A chatbot", show_label=False)
state = gr.State([system_template])
with gr.Row():
                ask = gr.Textbox(
                    show_label=True,
                    placeholder="Ask your Tikehau-related question here and press Enter",
                ).style(container=False)
#ask_examples_hidden = gr.Textbox(elem_id="hidden-message")
# examples_questions = gr.Examples(
# [
# "What is the AUM of Tikehau in 2022?",
# ],
# [ask_examples_hidden],
# examples_per_page=15,
#)
with gr.Column(scale=1, variant="panel"):
gr.Markdown("### Sources")
sources_textbox = gr.Markdown(show_label=False)
# dropdown_sources = gr.inputs.Dropdown(
# ["IPCC", "IPBES", "ALL"],
# default="ALL",
# label="Select reports",
# )
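            # The source dropdown is disabled; a constant state stands in so every
            # query searches all available documents.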
dropdown_sources = gr.State(["All"])
ask.submit(
fn=chat,
inputs=[
user_id_state,
ask,
state,
dropdown_sources
],
outputs=[chatbot, state, sources_textbox],
)
ask.submit(reset_textbox, [], [ask])
# ask_examples_hidden.change(
# fn=chat,
# inputs=[
# user_id_state,
# ask_examples_hidden,
# state,
# dropdown_sources
# ],
# outputs=[chatbot, state, sources_textbox],
# )
with gr.Row():
with gr.Column(scale=1):
gr.Markdown(
"""
Version 0.1-beta - This tool is under active development
"""
)
with gr.Column(scale=1):
gr.Markdown("*Source : Tikehau Universal Registration Documents *")
gr.Markdown("## How to use TKO URD Q&A")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown(
"""
### 💪 Getting started
- In the chatbot section, simply type your Tikehau-related question; answers are provided with references to the relevant URDs.
"""
)
with gr.Column(scale=1):
gr.Markdown(
"""
### ⚠️ Limitations
- Please note that, like any AI, the model may occasionally generate an inaccurate or imprecise answer.
"""
)
gr.Markdown("## 🙏 Feedback and feature requests")
gr.Markdown(
"""
### Beta test
- Feedback is welcome. Inspired by the ClimateQ&A tool by Ekimetrics.
"""
)
gr.Markdown(
"""
## 🛢️ Carbon Footprint
Carbon emissions were measured during the development and inference process using [CodeCarbon](https://github.com/mlco2/codecarbon).
| Phase | Description | Emissions | Source |
| --- | --- | --- | --- |
| Inference | API call to turbo-GPT | ~0.38 gCO2e / call | [medium.com/@chrispointon](https://medium.com/@chrispointon/the-carbon-footprint-of-chatgpt-e1bc14e4cc2a) |
Carbon emissions are **relatively low but not negligible** compared to other usages: one question asked to ClimateQ&A is around 0.482 gCO2e - equivalent to 2.2 m by car ([datagir.ademe.fr/apps/impact-co2](https://datagir.ademe.fr/apps/impact-co2/)), or around 2 to 4 times more than a typical Google search.
"""
)
demo.queue(concurrency_count=16)
demo.launch()