Theo Alves Da Costa committed on
Commit
46e3999
1 Parent(s): 6c0b6e5

Safety check on non sourced answers + theme

Browse files
Files changed (4) hide show
  1. .gitignore +4 -0
  2. app.py +80 -45
  3. requirements.txt +2 -1
  4. style.css +2 -7
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .env
2
+ __pycache__/app.cpython-38.pyc
3
+ __pycache__/app.cpython-39.pyc
4
+ __pycache__/utils.cpython-38.pyc
app.py CHANGED
@@ -13,6 +13,21 @@ import numpy as np
13
  from datetime import datetime
14
  from azure.storage.fileshare import ShareServiceClient
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  system_template = {"role": "system", "content": os.environ["content"]}
18
 
@@ -44,10 +59,14 @@ credential = {
44
  "account_name": os.environ["account_name"],
45
  }
46
 
47
- account_url = os.environ["account_url"]
48
- file_share_name = "climategpt"
49
- service = ShareServiceClient(account_url=account_url, credential=credential)
50
- share_client = service.get_share_client(file_share_name)
 
 
 
 
51
  user_id = create_user_id(10)
52
 
53
 
@@ -55,7 +74,7 @@ def chat(
55
  user_id: str,
56
  query: str,
57
  history: list = [system_template],
58
- report_type: str = "All available",
59
  threshold: float = 0.555,
60
  ) -> tuple:
61
  """retrieve relevant documents in the document store then query gpt-turbo
@@ -81,7 +100,7 @@ def chat(
81
 
82
  messages = history + [{"role": "user", "content": query}]
83
  sources = "\n\n".join(
84
- f"doc {i}: {d.meta['file_name']} page {d.meta['page_number']}\n{d.content}"
85
  for i, d in enumerate(docs, 1)
86
  if d.score > threshold
87
  )
@@ -91,43 +110,54 @@ def chat(
91
  {"role": "system", "content": f"{os.environ['sources']}\n\n{sources}"}
92
  )
93
 
94
- response = openai.Completion.create(
95
- engine="climateGPT",
96
- prompt=to_completion(messages),
97
- temperature=0.2,
98
- stream=True,
99
- max_tokens=1024,
100
- )
101
 
102
- if sources:
103
  complete_response = ""
104
  messages.pop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  else:
106
- sources = "No climate science report was used to provide this answer."
107
- complete_response = "**⚠️ No relevant passages found in the climate science reports, for a sourced answer you may want to try a more specific question (specifying your question on climate issues). The answer will probably reasonable, but not sourced on the IPCC, please use the following results with caution.**\n\n"
108
-
109
- messages.append({"role": "assistant", "content": complete_response})
110
- timestamp = str(datetime.now().timestamp())
111
- file = user_id[0] + timestamp + ".json"
112
- logs = {
113
- "user_id": user_id[0],
114
- "prompt": query,
115
- "retrived": sources,
116
- "report_type": report_type,
117
- "prompt_eng": messages[0],
118
- "answer": messages[-1]["content"],
119
- "time": timestamp,
120
- }
121
- log_on_azure(file, logs, share_client)
122
-
123
- for chunk in response:
124
- if (
125
- chunk_message := chunk["choices"][0].get("text")
126
- ) and chunk_message != "<|im_end|>":
127
- complete_response += chunk_message
128
- messages[-1]["content"] = complete_response
129
- gradio_format = make_pairs([a["content"] for a in messages[1:]])
130
- yield gradio_format, messages, sources
131
 
132
 
133
  def save_feedback(feed: str, user_id):
@@ -152,7 +182,7 @@ def log_on_azure(file, logs, share_client):
152
  file_client.upload_file(str(logs))
153
 
154
 
155
- with gr.Blocks(title="🌍 Climate Q&A", css="style.css") as demo:
156
 
157
  user_id_state = gr.State([user_id])
158
 
@@ -166,15 +196,20 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css") as demo:
166
  gr.Markdown(
167
  """
168
  <p><b>Climate change and environmental disruptions have become some of the most pressing challenges facing our planet today</b>. As global temperatures rise and ecosystems suffer, it is essential for individuals to understand the gravity of the situation in order to make informed decisions and advocate for appropriate policy changes.</p>
169
- <p>However, comprehending the vast and complex scientific information can be daunting, as the scientific consensus references, such as <b>the Intergovernmental Panel on Climate Change (IPCC) reports, span thousands of pages</b> and are often laden with technical jargon. To bridge this gap and make climate science more accessible, we introduce <b>ClimateQ&A as a tool to distill expert-level knowledge into easily digestible insights about climate science.</b></p>
170
  <div class="tip-box">
171
  <div class="tip-box-title">
172
  <span class="light-bulb" role="img" aria-label="Light Bulb">💡</span>
173
  How does ClimateQ&A work?
174
  </div>
175
- ClimateQ&A harnesses modern OCR techniques to parse and preprocess IPCC reports. By leveraging state-of-the-art question-answering algorithms, <i>ClimateQ&A is able to sift through the extensive collection of climate scientific reports and identify relevant passages in response to user inquiries</i>. Furthermore, the integration of the ChatGPT API allows ClimateQ&A to present complex data in a user-friendly manner, summarizing key points and facilitating communication of climate science to a wider audience. This tool effectively puts a climate expert in your pocket.
176
  </div>
177
 
 
 
 
 
 
178
  """
179
  )
180
 
@@ -186,7 +221,7 @@ ClimateQ&A harnesses modern OCR techniques to parse and preprocess IPCC reports.
186
 
187
  with gr.Row():
188
  with gr.Column(scale=2):
189
- chatbot = gr.Chatbot(elem_id="chatbot")
190
  state = gr.State([system_template])
191
 
192
  with gr.Row():
@@ -252,7 +287,7 @@ ClimateQ&A harnesses modern OCR techniques to parse and preprocess IPCC reports.
252
  state,
253
  gr.inputs.Dropdown(
254
  ["IPCC only", "All available"],
255
- default="All available",
256
  label="Select reports",
257
  ),
258
  ],
@@ -379,7 +414,7 @@ If you have any questions or feature requests, please feel free to reach us out
379
  ## 💻 Developers
380
  For developers, the methodology used is detailed below :
381
 
382
- - Extract individual paragraphs from scientific reports (e.g., IPCC, IPBES) using OCR techniques and open sources algorithms
383
  - Use Haystack to compute semantically representative embeddings for each paragraph using a sentence transformers model (https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1). 
384
  - Store all the embeddings in a FAISS Flat index. 
385
  - Reformulate each user query to be as specific as possible and compute its embedding. 
 
13
  from datetime import datetime
14
  from azure.storage.fileshare import ShareServiceClient
15
 
16
+ from dotenv import load_dotenv
17
+
18
+ # Load the environment variables from the .env file
19
+ load_dotenv()
20
+ print(os.environ)
21
+
22
+ # for key in ["CONTENT","API_KEY","SOURCES","RESSOURCE_ENDPOINT"]:
23
+ # os.environ[key.lower()] = os.environ[key]
24
+
25
+
26
+ theme = gr.themes.Soft(
27
+ primary_hue="sky",
28
+ font=[gr.themes.GoogleFont('Inter'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
29
+ )
30
+
31
 
32
  system_template = {"role": "system", "content": os.environ["content"]}
33
 
 
59
  "account_name": os.environ["account_name"],
60
  }
61
 
62
+ try:
63
+ account_url = os.environ["account_url"]
64
+ file_share_name = "climategpt"
65
+ service = ShareServiceClient(account_url=account_url, credential=credential)
66
+ share_client = service.get_share_client(file_share_name)
67
+ except:
68
+ print("Skipped logging")
69
+
70
  user_id = create_user_id(10)
71
 
72
 
 
74
  user_id: str,
75
  query: str,
76
  history: list = [system_template],
77
+ report_type: str = "IPCC only",
78
  threshold: float = 0.555,
79
  ) -> tuple:
80
  """retrieve relevant documents in the document store then query gpt-turbo
 
100
 
101
  messages = history + [{"role": "user", "content": query}]
102
  sources = "\n\n".join(
103
+ f"📃 doc {i}: {d.meta['file_name']} page {d.meta['page_number']}\n{d.content}"
104
  for i, d in enumerate(docs, 1)
105
  if d.score > threshold
106
  )
 
110
  {"role": "system", "content": f"{os.environ['sources']}\n\n{sources}"}
111
  )
112
 
113
+ response = openai.Completion.create(
114
+ engine="climateGPT",
115
+ prompt=to_completion(messages),
116
+ temperature=0.2,
117
+ stream=True,
118
+ max_tokens=1024,
119
+ )
120
 
 
121
  complete_response = ""
122
  messages.pop()
123
+
124
+ messages.append({"role": "assistant", "content": complete_response})
125
+ timestamp = str(datetime.now().timestamp())
126
+ file = user_id[0] + timestamp + ".json"
127
+ logs = {
128
+ "user_id": user_id[0],
129
+ "prompt": query,
130
+ "retrived": sources,
131
+ "report_type": report_type,
132
+ "prompt_eng": messages[0],
133
+ "answer": messages[-1]["content"],
134
+ "time": timestamp,
135
+ }
136
+ try:
137
+ log_on_azure(file, logs, share_client)
138
+ except:
139
+ pass
140
+
141
+
142
+ for chunk in response:
143
+ if (
144
+ chunk_message := chunk["choices"][0].get("text")
145
+ ) and chunk_message != "<|im_end|>":
146
+ complete_response += chunk_message
147
+ messages[-1]["content"] = complete_response
148
+ gradio_format = make_pairs([a["content"] for a in messages[1:]])
149
+ yield gradio_format, messages, sources
150
+
151
+
152
  else:
153
+ sources = "⚠️ No relevant passages found in the climate science reports"
154
+ complete_response = "**⚠️ No relevant passages found in the climate science reports, you may want to ask a more specific question (specifying your question on climate issues).**"
155
+
156
+ messages.append({"role": "assistant", "content": complete_response})
157
+
158
+ gradio_format = make_pairs([a["content"] for a in messages[1:]])
159
+ yield gradio_format, messages, sources
160
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
 
163
  def save_feedback(feed: str, user_id):
 
182
  file_client.upload_file(str(logs))
183
 
184
 
185
+ with gr.Blocks(title="🌍 Climate Q&A", css="style.css",theme = theme) as demo:
186
 
187
  user_id_state = gr.State([user_id])
188
 
 
196
  gr.Markdown(
197
  """
198
  <p><b>Climate change and environmental disruptions have become some of the most pressing challenges facing our planet today</b>. As global temperatures rise and ecosystems suffer, it is essential for individuals to understand the gravity of the situation in order to make informed decisions and advocate for appropriate policy changes.</p>
199
+ <p>However, comprehending the vast and complex scientific information can be daunting, as the scientific consensus references, such as <b>the Intergovernmental Panel on Climate Change (IPCC) reports, span thousands of pages</b>. To bridge this gap and make climate science more accessible, we introduce <b>ClimateQ&A as a tool to distill expert-level knowledge into easily digestible insights about climate science.</b></p>
200
  <div class="tip-box">
201
  <div class="tip-box-title">
202
  <span class="light-bulb" role="img" aria-label="Light Bulb">💡</span>
203
  How does ClimateQ&A work?
204
  </div>
205
+ ClimateQ&A harnesses modern OCR techniques to parse and preprocess IPCC reports. By leveraging state-of-the-art question-answering algorithms, <i>ClimateQ&A is able to sift through the extensive collection of climate scientific reports and identify relevant passages in response to user inquiries</i>. Furthermore, the integration of the ChatGPT API allows ClimateQ&A to present complex data in a user-friendly manner, summarizing key points and facilitating communication of climate science to a wider audience.
206
  </div>
207
 
208
+ <div class="warning-box">
209
+ Version 0.2-beta - This tool is under active development
210
+ </div>
211
+
212
+
213
  """
214
  )
215
 
 
221
 
222
  with gr.Row():
223
  with gr.Column(scale=2):
224
+ chatbot = gr.Chatbot(elem_id="chatbot",label = "ClimateQ&A chatbot")
225
  state = gr.State([system_template])
226
 
227
  with gr.Row():
 
287
  state,
288
  gr.inputs.Dropdown(
289
  ["IPCC only", "All available"],
290
+ default="IPCC only",
291
  label="Select reports",
292
  ),
293
  ],
 
414
  ## 💻 Developers
415
  For developers, the methodology used is detailed below :
416
 
417
+ - Extract individual paragraphs from scientific reports (e.g., IPCC, IPBES) using OCR techniques and open sources algorithms
418
  - Use Haystack to compute semantically representative embeddings for each paragraph using a sentence transformers model (https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1). 
419
  - Store all the embeddings in a FAISS Flat index. 
420
  - Reformulate each user query to be as specific as possible and compute its embedding. 
requirements.txt CHANGED
@@ -2,4 +2,5 @@ faiss-cpu==1.7.2
2
  farm-haystack==1.14.0
3
  gradio==3.22.1
4
  openai==0.27.0
5
- azure-storage-file-share==12.11.1
 
 
2
  farm-haystack==1.14.0
3
  gradio==3.22.1
4
  openai==0.27.0
5
+ azure-storage-file-share==12.11.1
6
+ python-dotenv==1.0.0
style.css CHANGED
@@ -11,8 +11,8 @@
11
 
12
 
13
  .tip-box {
14
- background-color: #e0f7fa;
15
- border: 1px solid #80deea;
16
  border-radius: 4px;
17
  margin-top:20px;
18
  padding: 15px 20px;
@@ -40,11 +40,6 @@
40
  display:none;
41
  }
42
 
43
- .message.user{
44
- border-color:#53bcd4 !important;
45
- background-color: #daf1f6 !important;
46
- }
47
-
48
  .message{
49
  font-size:14px !important;
50
  }
 
11
 
12
 
13
  .tip-box {
14
+ background-color: #f0f9ff;
15
+ border: 1px solid #80d4fa;
16
  border-radius: 4px;
17
  margin-top:20px;
18
  padding: 15px 20px;
 
40
  display:none;
41
  }
42
 
 
 
 
 
 
43
  .message{
44
  font-size:14px !important;
45
  }