momenaca committed
Commit 45c901d · 1 Parent(s): 1478649

first commit

.gitignore ADDED
@@ -0,0 +1,8 @@
+ .env
+ __pycache__/app.cpython-38.pyc
+ __pycache__/app.cpython-39.pyc
+ __pycache__/utils.cpython-38.pyc
+
+ notebooks/
+ *.pyc
+ local_tests/
README.md CHANGED
@@ -7,6 +7,9 @@ sdk: gradio
  sdk_version: 4.36.1
  app_file: app.py
  pinned: false
+ hf_oauth: true
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Spinoza Project
+
+ WIP
app.py ADDED
@@ -0,0 +1,582 @@
+ import time
+ import yaml
+ import logging
+ import gradio as gr
+ from langchain.prompts.chat import ChatPromptTemplate
+ from huggingface_hub import hf_hub_download, whoami
+ from app.source.backend.llm_utils import get_llm
+ from app.source.backend.document_store import pickle_to_document_store
+ from app.source.backend.get_prompts import get_qa_prompts
+ from app.source.frontend.utils import (
+     make_html_source,
+     make_html_presse_source,
+     init_env,
+ )
+ from app.source.backend.prompt_utils import to_chat_instruction, SpecialTokens
+
+ init_env()
+
+ with open("./app/config.yaml") as f:
+     config = yaml.full_load(f)
+
+ prompts = {}
+ for source in config["prompt_naming"]:
+     with open(f"./app/prompt_{source}.yaml") as f:
+         prompts[source] = yaml.full_load(f)
+
+ ## Building LLM
+ print("Building LLM")
+ model = "gpt35turbo"
+ llm = get_llm()
+
+ ## Loading tools
+ print("Loading Databases")
+ qdrants = {
+     tab: pickle_to_document_store(
+         hf_hub_download(
+             repo_id="momenaca/spinoza-dbs",
+             filename=f"database_{tab}.pickle",
+             token=True,
+             repo_type="dataset",
+         )
+     )
+     for tab in config["prompt_naming"]
+ }
+
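+ # The pickled files above are assumed to each hold a pre-built LangChain vector
+ # store (see pickle_to_document_store in app/source/backend/document_store.py),
+ # one per source in config["prompt_naming"], downloaded from the momenaca/spinoza-dbs
+ # dataset repo; token=True makes hf_hub_download use the locally saved HF token.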
+ ## Load Prompts
+ print("Loading Prompts")
+ chat_qa_prompts, chat_reformulation_prompts, chat_summarize_memory_prompts = {}, {}, {}
+ for source, prompt in prompts.items():
+     chat_qa_prompt, chat_reformulation_prompt = get_qa_prompts(config, prompt)
+     chat_qa_prompts[source] = chat_qa_prompt
+     chat_reformulation_prompts[source] = chat_reformulation_prompt
+     # chat_summarize_memory_prompts[source] = chat_summarize_memory_prompt
+
+ with open("./assets/style.css", "r") as f:
+     css = f.read()
+
+
+ def update_tabs(outil, visible_tabs):
+     visible_tabs = outil
+     return visible_tabs
+
+
+ special_tokens = SpecialTokens(config)
+
+ synthesis_template = """You are a factual journalist who summarizes the specialized answers from technical sources.
+
+ Based on the following question:
+ {question}
+
+ And the following expert answers:
+ {answers}
+
+ Answer the question, in French.
+ When using legal answers, keep track of the names of the articles.
+ When using ADEME answers, name the sources that are mainly used.
+ List the different elements mentioned, and highlight the agreement points between the sources, as well as the contradictions or differences.
+ Generate the answer as markdown, with an aerated layout, and headlines in bold.
+ Start with a general summary of agreements and contradictions, and then go into detail without paraphrasing the experts' answers.
+ """
+
+ synthesis_prompt = to_chat_instruction(synthesis_template, special_tokens)
+ synthesis_prompt_template = ChatPromptTemplate.from_messages([synthesis_prompt])
+
+
+ def zip_longest_fill(*args, fillvalue=None):
+     # Like itertools.zip_longest, but an exhausted iterator repeats its last
+     # value instead of a fixed fillvalue; iteration stops once all iterators
+     # are exhausted.
+     iterators = [iter(it) for it in args]
+     num_active = len(iterators)
+     if not num_active:
+         return
+
+     cond = True
+     fillvalues = [None] * len(iterators)
+     while cond:
+         values = []
+         for i, it in enumerate(iterators):
+             try:
+                 value = next(it)
+             except StopIteration:
+                 value = fillvalues[i]
+             values.append(value)
+
+         new_cond = False
+         for i, elt in enumerate(values):
+             if elt != fillvalues[i]:
+                 new_cond = True
+         cond = new_cond
+
+         fillvalues = values.copy()
+         yield tuple(values)
+
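+ # Sketch of the intended streaming behavior: zipping generators that yield
+ # "a", "ab" and "x", "xy", "xyz" gives ("a", "x"), ("ab", "xy"), ("ab", "xyz"),
+ # plus one final duplicate tuple before the loop notices that nothing changed.
+ # This lets several partial LLM answers be rendered in lockstep below.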
+
+ def build_data_dict(config):
+     data_dict = {}
+     for tab in config["tabs"]:
+         data_dict[tab] = {
+             "tab": {
+                 "init_value": tab,
+                 "component": None,
+                 "elem_id": "tab",
+             },
+             "description": {
+                 "init_value": config["tabs"][tab],
+                 "component": None,
+                 "elem_id": "desc",
+             },
+             "question": {
+                 "init_value": None,
+                 "component": None,
+                 "elem_id": "question",
+             },
+             "answer": {
+                 "init_value": None,
+                 "component": None,
+                 "elem_id": "answer",
+             },
+             "sources": {
+                 "init_value": None,
+                 "component": None,
+                 "elem_id": "src",
+             },
+         }
+     return data_dict
+
+
+ def init_gradio(data, config=config):
+     for t in data:
+         data[t]["tab"]["component"] = gr.Tab(
+             data[t]["tab"]["init_value"], elem_id="tab"
+         )
+         with data[t]["tab"]["component"]:
+             for fields in data[t]:
+                 if fields == "question":
+                     data[t][fields]["component"] = gr.Textbox(
+                         elem_id=data[t][fields]["elem_id"],
+                         show_label=False,
+                         interactive=True,
+                         placeholder="",
+                     )
+                 # elif fields == "answer":
+                 #     data[t][fields]["component"] = gr.Textbox(
+                 #         elem_id=data[t][fields]["elem_id"],
+                 #         show_label=True,
+                 #         interactive=True,
+                 #         placeholder="",
+                 #         show_copy_button=True,
+                 #     )
+                 elif fields != "tab":
+                     data[t][fields]["component"] = gr.Markdown(
+                         data[t][fields]["init_value"],
+                         elem_id=data[t][fields]["elem_id"],
+                     )
+                     # data[t][fields]["component"] = gr.Textbox(
+                     #     value=data[t][fields]["init_value"],
+                     #     elem_id=data[t][fields]["elem_id"],
+                     #     show_label=True,
+                     #     interactive=False,
+                     #     show_copy_button=True,
+                     # )
+     return data
+
+
+ def add_warning():
+     return "*Les éléments cochés ont commencé à être générés dans les onglets spécifiques, la synthèse ne sera disponible qu'après la mise à disposition de ces derniers.*"
+
+
+ def format_question(question):
+     return f"{question}"  # ###
+
+
+ def parse_question(question):
+     x = question.replace("<p>", "").replace("</p>\n", "")
+     if "### " in x:
+         return x.split("### ")[1]
+     return x
+
+
+ def reformulate(outils, question, tab, config=config):
+     if tab in outils:
+         return llm.stream(
+             chat_reformulation_prompts[config["source_mapping"][tab]],
+             {"question": parse_question(question)},
+         )
+     else:
+         return iter([None] * 5)
+
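+ # When a tab is unchecked, reformulate returns a stream of None values instead
+ # of an LLM stream, so zip_longest_fill still gets one iterator per tab and the
+ # disabled tabs simply have nothing to display.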
+
+ def reformulate_single_question(outils, question, tab, config=config):
+     for elt in reformulate(outils, question, tab, config=config):
+         time.sleep(0.02)
+         yield elt
+
+
+ def reformulate_questions(outils, question, config=config):
+     for elt in zip_longest_fill(
+         *[reformulate(outils, question, tab, config=config) for tab in config["tabs"]]
+     ):
+         time.sleep(0.02)
+         yield elt
+
+
+ def add_question(question):
+     return question
+
+
+ def answer(question, source, outils, tab, config=config):
+     if tab in outils:
+         if len(source) < 10:
+             return iter(["Aucune source trouvée, veuillez reformuler votre question"])
+         else:
+             return llm.stream(
+                 chat_qa_prompts[config["source_mapping"][tab]],
+                 {
+                     "question": parse_question(question),
+                     "sources": source.replace("<p>", "").replace("</p>\n", ""),
+                 },
+             )
+     else:
+         return iter([None] * 5)
+
+
+ def answer_single_question(outils, source, question, tab, config=config):
+     for elt in answer(question, source, outils, tab, config=config):
+         time.sleep(0.02)
+         yield elt
+
+
+ def answer_questions(outils, *questions_sources, config=config):
+     questions = list(questions_sources[: len(questions_sources) // 2])
+     sources = list(questions_sources[len(questions_sources) // 2 :])
+
+     for elt in zip_longest_fill(
+         *[
+             answer(question, source, outils, tab, config=config)
+             for question, source, tab in zip(questions, sources, config["tabs"])
+         ]
+     ):
+         time.sleep(0.02)
+         yield elt
+
+
+ def get_source_link(metadata):
+     return metadata["file_url"] + f"#page={metadata['content_page_number'] + 1}"
+
+
+ def get_button(i, tag):
+     return f"""<button id="btn_{tag}_{i}" type="button" style="margin: 0; display: inline;" align="right">[{i}]</button>"""
+
+
+ def get_html_sources(buttons, cards):
+     return f"""
+     <p style="margin: 0; display: inline;"><strong><br>Sources utilisées : </strong></p>
+     {buttons}
+     {cards}
+     """
+
+
+ def get_sources(outils, question, tab, qdrants=qdrants, config=config):
+     k = config["num_document_retrieved"]
+     min_similarity = config["min_similarity"]
+     if tab in outils:
+         sources = qdrants[
+             config["source_mapping"][tab]
+         ].similarity_search_with_relevance_scores(
+             config["query_preprompt"]
+             + question.replace("<p>", "").replace("</p>\n", ""),
+             k=k,
+             # filter=get_qdrant_filters(filters),
+         )
+         sources = [(doc, score) for doc, score in sources if score >= min_similarity]
+
+         buttons_ids = list(range(len(sources)))
+         buttons = " ".join(
+             [get_button(i, tab) for i, source in zip(buttons_ids, sources)]
+         )
+         formated = (
+             "\n\n".join(
+                 [
+                     make_html_presse_source(source[0], i, tab, source[1], config)
+                     for i, source in zip(buttons_ids, sources)
+                 ]
+             )
+             if tab == "Presse"
+             else "\n\n".join(
+                 [
+                     make_html_source(source[0], i, tab, source[1], config)
+                     for i, source in zip(buttons_ids, sources)
+                 ]
+             )
+         )
+         formated = get_html_sources(buttons, formated) if sources else ""
+         text = "\n\n".join(
+             [
+                 f"Doc {str(i)} with source type {elt[0].metadata.get('file_source_type')}:\n"
+                 + elt[0].page_content
+                 for i, elt in enumerate(sources)
+             ]
+         )
+         return str(formated), str(text)  # formated_sources, text_sources
+     else:
+         return "", ""
+
+
+ def retrieve_sources(outils, *questions, qdrants=qdrants, config=config):
+     results = [
+         get_sources(outils, question, tab, qdrants, config)
+         for question, tab in zip(questions, config["tabs"])
+     ]
+     formated_sources = [source[0] for source in results]
+     text_sources = [source[1] for source in results]
+     return tuple(formated_sources + text_sources)
+
+
+ def get_experts(outils, *answers, config=config):
+     return "\n\n".join(
+         [
+             f"{tab}\n{answers[i]}"
+             for i, tab in enumerate(config["tabs"])
+             if (tab in outils)
+         ]
+     )
+
+
+ def get_synthesis(outils, question, *answers, config=config):
+     answer = []
+     for i, tab in enumerate(config["tabs"]):
+         if (tab in outils) & (len(str(answers[i])) >= 100):
+             answer.append(
+                 f"{tab}\n{answers[i]}".replace("<p>", "").replace("</p>\n", "")
+             )
+
+     if len(answer) == 0:
+         # This function is a generator, so the message must be yielded
+         # (a bare return would display nothing).
+         yield "Aucune source n'a pu être identifiée pour répondre, veuillez modifier votre question"
+     else:
+         for elt in llm.stream(
+             synthesis_prompt_template,
+             {
+                 "question": question.replace("<p>", "").replace("</p>\n", ""),
+                 "answers": "\n\n".join(answer),
+             },
+         ):
+             time.sleep(0.01)
+             yield elt
+
+
+ def get_listener():
+     return """
+     function my_func_body() {
+         const body = document.querySelector("body");
+         body.addEventListener("click", e => {
+             console.log(e)
+             const sourceId = "btn_" + e.target.id.split("_")[1] + "_" + e.target.id.split("_")[2] + "_source"
+             console.log(sourceId)
+             if (document.getElementById(sourceId).style.display === "none") {
+                 document.getElementById(sourceId).style.display = "";
+             } else {
+                 document.getElementById(sourceId).style.display = "none";
+             }
+         }
+         )}
+     """
+
+
+ def get_source_template(buttons, divs_source):
+     return """
+     <div class="source">
+     <p style="margin: 0; display: inline;"><strong><br>Sources utilisées :</strong></p>
+     {buttons}
+     {divs_source}
+     </div>
+     </div>
+     """
+
+
+ def activate_questions(outils, *textboxes, config=config):
+     activated_textboxes = []
+     for i, tab in enumerate(config["tabs"]):
+         if tab in outils:
+             activated_textboxes.append(
+                 gr.Textbox(
+                     show_label=False,
+                     interactive=True,
+                     placeholder="Sélectionnez cet outil et posez une question sur l'onglet de synthèse",
+                 )
+             )
+         else:
+             activated_textboxes.append(
+                 gr.Textbox(
+                     show_label=False,
+                     interactive=False,
+                     placeholder="Sélectionnez cet outil et posez une question sur l'onglet de synthèse",
+                 )
+             )
+     return activated_textboxes
+
+
+ def empty():
+     return ""
+
+
+ def empty_none():
+     return None
+
+
+ theme = gr.themes.Soft(
+     primary_hue="sky",
+     font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif", "system-ui", "sans-serif"],
+ )
+
+ logo_rsf = config["logo_rsf"]
+ logo_ap = config["logo_ap"]
+
+ data = build_data_dict(config)
+
+
+ def update_visible(oauth_token: gr.OAuthToken | None):
+     if oauth_token is None:
+         return {
+             bloc_1: gr.update(visible=True),
+             bloc_2: gr.update(visible=False),
+             bloc_3: gr.update(visible=False),
+         }
+
+     org_names = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
+
+     if "SpinozaProject" in org_names:  # logged in and in the group
+         return {
+             bloc_1: gr.update(visible=False),
+             bloc_2: gr.update(visible=True),
+             bloc_3: gr.update(visible=False),
+         }
+     else:  # logged in but not in the group
+         return {
+             bloc_1: gr.update(visible=False),
+             bloc_2: gr.update(visible=False),
+             bloc_3: gr.update(visible=True),
+         }
+
+
+ with gr.Blocks(
+     title=f"🔍{config['demo_name']}",
+     css=css,
+     js=get_listener(),
+     theme=theme,
+ ) as demo:
+     gr.LoginButton()
+
+     with gr.Column() as bloc_1:
+         textbox_1 = gr.Textbox("You are not logged in to Hugging Face!", show_label=False)
+
+     with gr.Column(visible=False) as bloc_3:
+         textbox_3 = gr.Textbox(
+             "You are not part of the Spinoza Project, ask for access here: https://huggingface.co/organizations/TestSpinoza/share/kmwhyFXasNnGfkBrKzNAPgnlRrxyVOSSMx"
+         )
+
+     with gr.Column(visible=False) as bloc_2:
+         gr.HTML(
+             f"""<div class="row_logo">
+             <img src="{logo_rsf}" alt="logo RSF" style="float:left; width:120px; height:70px">
+             <img src="{logo_ap}" alt="logo AP" style="width:120px; height:70px">
+             </div>"""
+         )
+
+         text_sources = {elt: gr.State("") for elt in config["tabs"]}
+         tab_states = {elt: gr.State(elt) for elt in config["tabs"]}
+         with gr.Row():
+             with gr.Column(scale=3):
+                 outils = gr.CheckboxGroup(
+                     choices=list(config["tabs"].keys()),
+                     value=list(config["tabs"].keys()),
+                     type="value",
+                     label="Choisir les bases de données à interroger",
+                 )
+             with gr.Column(scale=1):
+                 submit_btn = gr.Button(
+                     "Relancer la Synthèse", variant="primary", elem_id="synthese_btn"
+                 )
+
+         # Synthesis tab
+         synthesis_tab = gr.Tab("Synthesis", elem_id="tab")
+         with synthesis_tab:
+             question = gr.Textbox(
+                 show_label=True,
+                 label="Posez une question à Spinoza",
+                 placeholder="Quelle est votre question ?",
+             )
+             md_question = gr.Markdown(None, visible=False)
+             warning = gr.Markdown(None, elem_id="warn")
+             synthesis = gr.Markdown(None, elem_id="synthesis")
+
+         data = init_gradio(data)
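+         # Event pipeline for the synthesis tab: store the raw question, show the
+         # warning, clear the previous synthesis, then stream reformulations,
+         # retrieved sources, per-tab answers, and finally the cross-source synthesis.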
+         (
+             question.submit(add_question, [question], [md_question])
+             .then(add_warning, [], [warning])
+             .then(empty, [], [synthesis])
+             .then(
+                 reformulate_questions,
+                 [outils, md_question],
+                 [data[tab]["question"]["component"] for tab in config["tabs"]],
+             )
+             .then(
+                 retrieve_sources,
+                 [outils]
+                 + [data[tab]["question"]["component"] for tab in config["tabs"]],
+                 [data[tab]["sources"]["component"] for tab in config["tabs"]]
+                 + [text_sources[tab] for tab in config["tabs"]],
+             )
+             .then(
+                 answer_questions,
+                 [outils]
+                 + [data[tab]["question"]["component"] for tab in config["tabs"]]
+                 + [text_sources[tab] for tab in config["tabs"]],
+                 [data[tab]["answer"]["component"] for tab in config["tabs"]],
+             )
+             .then(
+                 get_synthesis,
+                 [outils, md_question]
+                 + [data[tab]["answer"]["component"] for tab in config["tabs"]],
+                 [synthesis],
+             )
+         )
+
+         for tab in config["tabs"]:
+             (
+                 data[tab]["question"]["component"]
+                 .submit(empty, [], [data[tab]["sources"]["component"]])
+                 .then(empty, [], [text_sources[tab]])
+                 .then(empty, [], [data[tab]["answer"]["component"]])
+                 .then(
+                     get_sources,
+                     [outils, data[tab]["question"]["component"], tab_states[tab]],
+                     [data[tab]["sources"]["component"], text_sources[tab]],
+                 )
+                 .then(
+                     answer_single_question,
+                     [
+                         outils,
+                         text_sources[tab],
+                         data[tab]["question"]["component"],
+                         tab_states[tab],
+                     ],
+                     [data[tab]["answer"]["component"]],
+                 )
+             )
+
+         (
+             submit_btn.click(empty, [], [synthesis]).then(
+                 get_synthesis,
+                 [outils, md_question]
+                 + [data[tab]["answer"]["component"] for tab in config["tabs"]],
+                 [synthesis],
+             )
+         )
+
+     demo.load(update_visible, inputs=None, outputs=[bloc_1, bloc_2, bloc_3])
+
+
+ if __name__ == "__main__":
+     demo.queue().launch(share=True, debug=True)
app/__init__.py ADDED
File without changes
app/config.yaml ADDED
@@ -0,0 +1,46 @@
+ demo_name: Spinoza Q&A
+ tabs:
+   GIEC et IPBES: "*Outil dédié aux rapports du GIEC et de l'IPBES.*"
+   Textes Juridiques: "*Outil dédié aux codes français modifiés par la loi climat (21/73).*"
+   Documents Stratégiques: "*Outil dédié aux données centrées sur le plan politique (SNBC).*"
+   ADEME: "*Outil dédié aux données issues de l'ADEME, dont nous avons notamment sélectionné différentes catégories de rapports :*\n
+     * *Les guides mis à disposition de la population*\n
+     * *Les rapports d'expérience sur de nouvelles technologies*\n
+     * *Des études et recherches sur des impacts locaux*\n
+     * *Des documents institutionnels (analyses demandées par la France & rapports d'activité)*\n
+     * *Les plans de transition sectoriels pour les secteurs industriels les plus émetteurs (verre, papier, ciment, acier, aluminium, chimie, sucre)*"
+   # Presse: "*Outil dédié aux données fournies par Aday concernant la presse.*"
+
+ logo_rsf:
+ "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAWMAAACOCAMAAADTsZk7AAAAw1BMVEX///8AAADmHEXlADblADTlED/lADrlADh4eHjlADPs7OwtLS350Nfj4+PmF0IwMDDsYnr1r7rqS2fkAC797vLvgJLwh5jzpbHueIvtaH798/X2u8T97fD3xs3y8vL74eaBgYHrVnBISEiqqqplZWWenp4mJibym6nkACr85OnpOFr61932vMXxlKPtb4TnI0zPz89UVFQXFxe8vLzpP17zoa6SkpI+Pj62trbjACLX19cPDw9bW1tvb2+WlpbjABzqUmq7tCTSAAAPP0lEQVR4nO2da3vaOBOGFR/kuut6AQMJBgKkbTgFSJvQJml3k///q3ZmJGNjycYmp/ftpedDA5Y8ODeKPDMauYwZGRkZGRkZGRkZGRkZGRkZGRkZGRkZGRkZGRkZGRkZGRkZGRkZGRkZKRpf/lWonxff/vlx93CuO+/8GnRzc/f5x4cvF5e/fn0vsaPX989v/bu+lz6cHNbF10/lRs4/3fz4VsHQvr6+zW/4/qrCGPT74bCp638MY60qMj45+VDF2lfDWKPKjE/+1s7Lx9szjDX6q5LBO8M4rxqMT35XsnhtGOdUh/HJTSWTVSEbxjr9Xc1mxTufYaxVBQ8O9dMwzqoe43E1ow+GcVb1GFecLFiloM8w1quSj8zYjWGcUU3GFSdkZhhnpDK++Ia6+FvL5U61cDP+qo7uL4ZxKpXxR9ly/m8lLtjru3L0s3Lm93FOH6p523+AVMZpHlMTFv9QDOiHtxqHVL1d/oEqY6zxc5Xs2yc6PM4f/mgYpyplrM4W/+TPf9AfPjeMU5UyVl2wL/nzrw3jgyplrM6qVRmz74bxTqWMPxnGL6FSxmrewTA+Qobx68swfn29FOPznAzjVM9lrN4WC2QYH81YdYQN47yey5j9NowP6dmMq65CG8ZVGX9TLVSscjOMqzJWM8UVF+8M4wLGmnmgmg3DOKtSxpok/bXOSJUaN8NYz3isotIvwp0fXr8zjPWMNah+Ftg5uNp/efc5J3nmsCkUxPguPpVvm0PGgqRp9zHD7Wp1Kt/2ZeuS3jWbmRMTm83TNovly/4yMbI8Xa1WzdHLQTygMsZaj6Fw20LNKoKTX/K8deSRfHsK70b34p0X9Rg7E022z0+p65Xle5x70YJw3fqy1e0z1o7s5MR1atO7H7H2vWixowVhHS0iMMLtqBG/FtSDZK4/kh7u9FVrxfX0D5e1GF/K085cS2oGI3BkyzcuoOokTeEMMLJhFMq3NtJqhUlrNGIxT6y4ZxmbPjC2dy0WnBXzpI233oAvqu7oS2sDNPpxLGPu+/Cv2xOMQ8+2bRzHnV1TiDwW4a7nhAnGtu9gnxVrO5wjc5c7a2HTBSP2vWTs+Da0+vAlbj1o8rkv376F6jNW85ipHpR0WyXGfLtcwo9wQYzDTV9Oq8CYT6nJ4jFbRpblreajMxiyUUyMveEymLg45uNOpwvfgbvudppk0+2QlZgYb5bL4Qa6w5TTgKZGOx5BZ959C8LHMNa7b4mqbyHLMLavGBv4AFcwXiTGgLGNkwSMOWfAmtD0hIet0PIDYgxH2dAWw5oxoO0vE5teUxpp22SYrTiOd/bkWg7eNPs2TUdvoSMYlxcWVqnC0jKOihnDmLPbbCog0VDEM5DxHJwPJ8PYCRKbJYzxC2JDOK3xoigLdQzji1KLVSEfxRhdD7aGw8PnMMbBPnz0Z5OXJFmsYxhrsm9Z6asR/1cY4/SN83C8HC3nL0myWEcxLh/Jah3WQcZDmlblPc+yFqBNvwpjmI/hMDokLMfY2qCVRcq4B9y3jHXhjsnd9enbhSBHMj65LCtEVos2DzAOgQcXLjExDkFOM2UMh+yBjnG4oBM9OpxjTFYsJnw3+M5c4awNyMl2IZBp6q79NXQk45OTf0uM/lWTMeLAGCTIMPYFY+80HkyRf6xjLE4M/QFZ2meMRtyEMfVzcTizYOG7+Gmh33s9rHs6mvHJz+KhXGmrQpaxS1EaDixi/NRqtW6vRAzichsRt5iWMYe2cCNjiT3G4QKtPEnGGNyFt3L+Ha5DHz8wunolqDkdz1hXjJzoVz3G7tnUkre6/D0viaWHWsbuauUms3HJPc/abCnSSS9wNMUvqPNCEA/oOYxPvhcNZV0Jfglj8CsAmI+3IYUxhMSOs8ERp7/nQUASyuROmV8x90U8zrq9dQM/qO+9Ywzy7ffv35WjtbHeqqbq4tdFTsnSoGAMnpUn54o9xrw7vArE37jed5vImIId8N3gu3Dwu2hxd/buMQjlfM6rUr7UR9Zq3qIwRy8Y46+Mf/Na/1gIGVOGYS/O23oy+jvAGFw3B+ftXQzynoxFgrhybcrJWGdWDfYOMI4dCx2tMsZ9aLrFF+iFBQnjZTIJHGB86olvaCJtXtnpRP7KKszRV61N0dYDaLalH2DMbkMaX+hXbFaoZp7xKILbVO/qCijRn72M8+AEW0zIe4zd3hStDBLG+F3gl9dBdzyIl3D2+93zJOMaj63RPNdCXUU9xBjiL7zxk+/GQX4vz5jdilsgZjpxBErGEzkJ5P1jF62I/DHFeZjbBD+674Ob4kSUP+6rV/MaKmRcbV+5kPoUsRp7xyTjoUg75NdBsoyXvvTl3BCDDsl4N03n47x0HYQYN6SxWy9ZB1ko1/I6Kl7Pq8H4l2JWdSwKGfdmzj0uyD36/mPMRve+Q5rBDekMmrIR77IVOZ7t+A2K6xaR/4g5oUcnsqgZWh8F4/VMGHEeR2xw7/scjm3vnRkO/3jt+LZn+/76/dbzEsZ1HtimOBc19ueNgiBAZvAjiFkcJBqlTanmw2ZzKA8t6QRx4t4BcaJQvGtHyyKDHwf9Zj94K8JljOs8R0xJXpg9kBkVM64zIY/zZg3jjErqK3QwP+Q3JXyUh3P6v94DGQ8Gg8O9qquEsW5CHisG/kTGrdn96gXtlTDWJXbGioE/kDH4g6uXTOCXMNZVyI8VAy/AeL5q3C5avaa8088nDdCE3nXhxVAcm/QzfU+zXsG2ITUZsXgiXp5tk8W6q7PW4rYhTsBWCqCn0Dn5JDqzKT4LHJAhHFyv1/AGvPYz0bxekeeynGS666+lJmPdhKwWsOjR12F8GnEX1yzsULhWw8gFzehNw3M9XBAdzVx/m+3rpnWGrGu7QtGSxTP52ouIQ9zy5Qn4VcVgOsIXHdu9x+hcdnadFX0WtvWTg7gS0+LiNccqOhZEme7s1N+/7qMYX6iM1RVpOqxk62swxvIfK8Tln5DTgS0FYjYtUjRcKl9hI5+KfJazXd8wHT3dpNbNAcZ+Ut0W2njjmmAbLS3ZMLBjW1R8YQQZZYJKSt3JzFw/OYhZpV1NnRXBn1FgZ7pnrjssH8lljDXla1UZ1/DdkJC3aFhIs58iE6liZIwropKx7Ivrn05zzwKnqC5hHNrSWhDhmw2uZWEGKKZsyGDH+N5xEKLnzLp7jN0I9HgqGYe4ooWlikHSdN8tupa6jDUTcgHj8TMYT1
zBs+HKvIMYO/g7CcZUdiEYJ317rmxPGPNuMATFgrGwiiv9K26Fi5i6oBlijIYE43g4DG5DizeD4SjL2D2L26A0twdmMJkSYNlX0tTIX/dRjDUTssqYsvFKdX0Nxk+hRaWuV0kB2iYMnxayLgUZY7I4ZUwJduy7x3iXLUbGISMqyHgtE8l42JeMMQktGMuPd0StRZZxYrolWpf+jvEurw+XRidi//Ikac3nCanFK5RnVvIVapB4kDEm3pBxHME1P4WUKxOM4XCGsVwo4pUY71JxFtV6EmP8mo5nvKvfasjvu+8cSkSXMtZMyIqBBy15dfW/lLFc/OHIGHDy6ZkrVt+IMYzAlLFD41j2rcJYlHJmGQOSFxrHL8BYMyGrNZsf7zQPalPjl0LGcK1hK+P9wDzg9adcsBGM7ebcT4qHwyfFU6rL2OKHGK8H8/lc1urvMw4nSVP+uotVylizqFfxwXdqHF7IeIqV8z5vnMqgYcvhz7vviYQ63fPgdxkIxttcXyG8oT1B2LA+E4yteDC/Dfcm8AxjLGa+6pYytlwfBF4K1ZEH7QGuXTWEXxFSU5Bci5u7ltqMNRPyuBJizZdTyDhekD/sen6P3Ez4ffx54IjVZmC8wTv/lUfIWNI36mVcUvSiQgwUNpIxUACSEJFoGIdYuNlYlTO25LQt90Pg+ou9zfjHZDO97vZzGKsb9NQlD500myKL47z4zLGpAo1v5C/tYqUsTYrI+NQDKMLlYnHHy/ZNGZNXnDAm2Thzqoz5FLn1yhmHtBa4zMQgLsYZwa6Jkv4dW72W+ow1hWuVJgvNlqiynFDcXG9oERNnVQibFrjaTOv8OFe0ab4QjKmvhX3zMQiGBm4yjr3Ev9Yxbnpkr4Rx+LSdTqergRzHDvp7tFSDV0VNYn6I+8q11GesKSWu8j9Y6LJJh/Juy1uxCaaNUcNqtQFKkrFcAZWMqS+Wuu3HIFu8Fc0l43lSTKFjvJLbzMrueTs/gfb1BFwWduz5FZlreU4Moh2QFR6pq9uoV8i45fEZ+cciBlk6Mni1ooFkDJOzZNyyuSikOhCDDBxZCaNjjDAPMM75bovQ8uU4Tv3jCVxLEg89w3fT7pg+/Jxp7Ta9Yv9YxksyBqHpkMqREQ4yjmlj3n4McijOgzO8eRHjoVOTMXyA3cwz3sUgz2WsrSQ+8MB0/ZNZqsQgyHjKqax+I/JegjFl4rKu2EHGSWWGLI1jzMswxnKWWoyTMD/P2HmRcazd2/G9ZKtp4VprWU6Iplq4FeHvthaXPhTOm2DcjlLGu76lcV4CqsPFDIo3MosljFe8xnwMrfD5VIyXj/Ooohmv5VmMC7YdlEwXRc8XKs1t8lV/Kvbk0l0mpogaE72CsUhxytwm7/an6DB1sxbyjOeyOOgK3vLbTk/mjSXjeVTOWOzX2TR3cR5MPdFI+G6iabq77m1o7QpHj2RcUANetBvkXDeBlzPGFK9FGxIsvIdsRDYo5nTXkoyHvmAczHZ9o3QhRBdLLyQ6nBYgPpEnSMZ4DyiPQZJdP5JxR/z9BEmTvdq/7syizBGMix6xcql5+j/7VLIpvdh3O5Pb+d0I/uQGnix1BTgcGYfImMnNzqwTubJvOh3vbkmScYiMsYoOT1h6XKSjI2SbMO7DXW8mGbuigF+N89BmS9yQ+2KSEA6OXAfRX4tO6jMq9hmXbLa7GH++uRYhyaebm7sfX0p3geie9iTVb3mO4/AnvEkFM+7TlPfk8NmcTRyPI+NuxKNtvm+iTsRnifMc33PPg59NsEO3p0EntOEEWnll8YwTa8Y9/ihOaDnCBYPv0+e47aQ/40IzYHwrHLTBjHs+XZyQMKK7Fp3OP+aV66C0p/qEElbwVUlPjd09xaPRSCYgBu2BWEKGF5lykgGtPeT6Jme3d410/kD+TNII8/SExEybrKefk3kFP6TitN9A/Mw26a/FyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjI6IX0H0K+lwiZ80xBAAAAAElFTkSuQmCC"
+
+ logo_ap:
+ "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAQoAAAC+CAMAAAD6ObEsAAAAolBMVEX///8xL014x8kuLEuAysxvxMYdGkAvLUwrKUkoJkf6+vslIkXj8vMfHEEbGD8hHkIYFD1VVGrz8/XFxMvl5efW1tvy+vqSkZ2O0NE5OFS8u8IPCjkVETwMBjhEQlx6eYmamaSEg5K1tL2trLTR0dYAADNOTGXr6+7d3eFXVmuIh5ViYXVvbn82NFJqaXufn6kAACsAADDR6+sAACcAACFHRV3RZPTPAAAX0UlEQVR4nO1daYOiuhLFywwkMCyCXp8CokILikvPvc///9deqrIAbt120z098zifxISQHLJUqiqF9u2v+/im/SF4uaE9FT0VPRU9FT0VDfRUKPRUKPRUKPRUKPRUKDxGxfcr+HOouNa6W1R8/+fvHxf4+1c3oStca9s/329R8eNXV/ez8aOnQqKnQqGnQqGnQqGnQqGnQqGnQqGnQqGnQqGnQqGnQqGnQqGnQuGzqQiOu11mfUjR70XnVFgTjuUIL49bvNqOeWruOpQapOyo9p2ieypMG/HEG7/28CqZ4VUR6wMGYn7FfvFWKiyBy4SpCa0d+JyKIcErh1MxsfFqYGTdt+TdeCMV44S/e2d4nnKXig2/GtD0I9ryTrSp+E8Dd6lIKW+TbHGNu1QcvjQVzda/+q7AF0xcNuouFUdDELjqqv6/HEdPUjHwgnbSXSqsKfYme/3ZFf44TPWaimM76S4VWjGMDCeqzuj7jRGq8TEY6NN22n0q2IS7Wn1JqeKNqEhNxcAPW2kvUfFnoYwaTAxI1Up8CxVWMM7L4kWxywpGxa2hhWk3SwiK0e3E92ArBCU+YZj+qFWne1QE2QrRkrDydBr5fhxNU3bLbMHBu9pI5Ie7V4d9HEf+adJ6HKKcbGgUx/H+kF1SZc12UzeKI7o5Xt75Tlhi+dDXnAt720q9R8XoyUE816+orCLKy9FptCkmMYpu8QQTZzy/P9dme5dAwaYu0xTyKrL5U0ziOcs2GdZx4FIdq0S8eNfxNJU5vObDsS8q0Ox796kQckWi7jhGzXmHkI2gV1BhiEctonrNGri75vN2rRIGtt6cu8ZTx6zTTBp3K/GfeK2cTJvzX0ZTYHqMimM8aEO0uE0Fu25lMpbqccG6ncRKiOrqhD45Sz3vUu9CLmrPZCshaemnt1IxiwbXcU7FeYNy+bQ5vZIq5+jcNy8TO+wXO/5wynppIR7l53XyI1QUhqqqSUhjCLxEBZHi6u68T2BZXoGJgSmLpJ5ny99RZ/NFIToFNv9Q0/IWKlRDiL/fVNNGb76kgri+66oMPm/PTI4v3YsJjWVx9ICpqbgm9i5bLU6Cd7LpigqhcuBCphQ7jeItVIxi0Sm8zRj+GA/V1uaCCqfKgyCvZAaPd/O5nFrmqyIIyonsZTEwVYo+aw/5Ipq5/Dpu9OF3gS9MA2+BV3sx4S/eQsVCvDVPzYKpbOo5FZ5YsOWLpnhLGJ8VELridsguxJ+6Fxz50tfsw+/BTDzM4EzLPrJ/CxViKSKNfeqaXKVCZbFkn8chILQmDXl34bFZx/YN+IfvGU2n7rHigV43VIimyaeX4r24SrB+PRWB6AJuQxDI/atUGKp42XrUnokNsltPhIEf7atjDmLWiFfNnGcKQnMUn6ub3gS5/TBk9YWKru6Fr6ei5B3M9JsPsK9R0XizmVdTEfDK6E3tR1hKkUUOFtNTIOfEvgdKkTcqEMFR1N2Xb+b1VAhhtSWVSJXfGRWmSp85NRXivd/QDq6cwQ00Zra3o1bk+RGHXL9sOXM9TEV7cdvSK1To85oKo6ZCjE77ugB5mwq6vXrDY2go8i6oDh6lovTP26kpAe6MinoINKko+AChS+0a5Dyj2+eIr9/wGKZNibANsdI/QIW8iJu7OWq+QEVjgFhccGiuQNpsJqcVMQPrw+PiHB3MFU1F3jnky31gMRWD6+UV5EavEBKWadR6CGvgu/NdNg5gV3B+b5eozvd5TQjF3gNUiOKa8+b86s70FhVLTqaQswFMrhjo1IvmdVkNhWOZd6VfHt3aR/LaHR6lQig+BrZaAg5qFn4VFVLadOTgX4nyUdqU4t9cDsBgGtHhdjbqQLUnFXkkakE83nRHD1IRiJV/4A1z9ocVTtWO+3VU1J1oDXuQfCcLxD3ISOxB6JBPH+WJwtbMSdrK2LdACb1rK2jAClvj+5Gd6UQuSMQfrNe6W0/Kr6QiVDtTxyCeK5m0+S5D7Uy9XZZNhnJfG71/Oyb784UZXKwrpm49SIWlFArsZr2pZnklFfWIGjT1drbQV5CmvkLOc3Y9s7wZJ9kbz+ceucN0Vw9SoeWv02LdpsI6XdFiRXKivKbFIvv3T51SkXcp547ECMEF/jHd5ixpSyr61Z3pbSq04HSh24xrseFSt0lJBxYAIQe2tHcC0lcAtnwParxDu9kUb767tge5Q4VmpS2Nt+ntm/Vra7wHuj8stHejeHYNgD+4TAt/YpIRsWFoUR9/P/ManWK8+intILyQf+v1LEh9D8e0Tg13oW0TzJDwbcLspw+Iarus/Kchi+TDyEYbCSvCpZP2UmktbJfbWUyd+oNOnBlGocC1zb5MCxu/+ZDMxRV/GVYjn0KQHfZuZE/RtDUWGfhGt8g56mde/sNQLioSxZFz2s2ueETN0mkcgV1tGV4kfkFc8+p6sIA7JlPNKkYvm2R79OjRo0ePHr8E1927H8nwx2DpEJLc29fMKaHzO+l/Dpb2wIzvUkEG5P+GioF/lwr9zL7xx6KnQqGnQqGnQisW1XydhmgGkFSMjmm1W5xpMRpUWKv0dNqttBF4ODRcwuBypuXb9bySDrplGIazQNPy44SrFsaLXZU2nHDLRVpttkphF2TLqlpmtWKqyFhdJl05GN3DIvaITqi/AbUbp8LaRR5bOO142OokNRUrx6C6To39Mnacp1rtaMWG4w53kc1KtCOutNomrvtcBsPIi0CrXgxjm5XtRTuhIK8idk1sl6xEdRybXTqxsIxb7BFYl1PnHsznWEqTKQFtIlIR7JWTndN0AlRUZInQLeqgGm04eVhU/glwUSMP9iwjp+xPZwU+A0pfP0WVGGh0ic1S9QSYmsRoJjcHpoNGnmDuiWeRK/rXTpGByls3fGF5QSrAg5T4CbhZ6LShVZdUoAuE6fk+Z+ycCu6DiMQuBBUmWrxYxsBnP2icxCwjBT1v5rIf1TH1TcbFSCsS8BNcTEx2O1IzFHUBs6nXgTb3NizQHdN5WAp7HFABZ8HoeqwFR7fhbdKgAvyTdDsrxxN0TLygwqvyMhwCT0+BsHKyhnruc4hOnd6ONRl++Bk6iaKLZP4z2R9ytI9CHax94lRHNhJ9qEvO69KVF951gG+e0LljlaEa9ZRwZPInrXcdIgHMlibBN4Ra+3MqhE0FTOpOJg2+9v6Y50GQqAaxZLC2M1pNugJVMD4H3F7ollVixHvAmkibPJio3I88kwWrhivGIJjlGBXg9qJv+Llr2rKQCCqg/fJwGbT3nA
qXkzdyOStIhWg/eJWRFMsGM4tRoB+Y6fiDNMNZEe2l1PeqIz42gHFT18X9SO02mITkcQeoFaMCHUO4gw9OirWVQVAB/nTyJCr8PqNCOVxSblrDuYLW2dkkCcC5pNTKJ5wVTbak4Dk84elOPB+sQDgr1XVxPvJ8M6OCG4hrKtAfkLoST/XjBRXQiaXvHriJnVMhLbnExOxAhXSfQ9c/T5X9zAjNPVc4rdswZwRDeSRGT8b8tTTqsvhAKqD5rliwwZLMqBizN0EnuURYL+eCCmi+s6rvP6NC9gBwtABaJo0xCLfaWV02jv3Zcp7E0F3QUq3lk40fG7DQVOgHQ5fX6tI9wKdATHMrl0+b4HEhfA2tvKWpEVRArxXzanlt2nT4PLIVvQGokOMJnGWFV2FRi7LsISX0F1GRAKx27FnmwAJ3c2lb7czP6AZgSh84y4KtVrFcTOFNO2DpZYJgZO4uVhB0i7E3pWbNnKuLaXwMtGIJLD2VgoqyLoIf2C6mcTTdasFssk6Q1rUOc2u+ODhP8O5h8WJUAJ/o924dkpgcPpSNJfRMO47EsQugAr2zbHO5A7HLaHiGSirwGBDxo9i5IWJ5ojx0lGlREQL17nxbwbGxaKVZ/9rEpOtsBXIG606Vz1ZPslhNmLwJi3wBZxxsstwZrGTvg8//z5X9X5fS5uoJ53SYveymg7ISOFK/ec85Faa0+FNcm1pUaEtcE7BsA4nycb3wQPI2Am2Mj7Y9xsQAI4WEsMKYFOWV6QcrVq0h3xXY9tqmFDsne+04qZsk3jSfPrWpzT0Btgm2mcZDj9LkbNpc73HboLsn7M8Tg5CknhgmT8IZMeHvOE24g6vueZApTMR6QsVyEboeD5oSrz/+0PusSuLIWQar5Xa7FA5Pi33kx0nVFmkm2+1WuF6P0yiKo8N4DPfUQpiQKyYkipO5WIWz6nCoGru6cuvEfhynkp18aUcxmzgW4ujZgq0nLH0n04PF9EpdPgxXjkIX5Qubn+JKOlIBq09w9+aiPDtCW7bdLouyvWoGL9XlC0JR0aOnQsHyCKE9FQBrPRyuP1Sr0KNHj6+IUXhfwrM+IepR/hkWkDNkaZruWrLM5MlPbh/JCiZmEvnVx24MgnkUzT89zNTSFXsQqwRYWsn2Rrp+q6ljA/ZMjdPaH1MptnP2uozU8bqnSkPh7JntCfbiqHpyQ+YNHNwl3TgE+S5ksLERPQH0yOT9J4AehKIC7VkZHPnUBzfdbBao6LCfOjjTeI7UpbZ8A8d4YMbH+/m7h6KiAtsEvJVQ99a31IlwEMI7Wh/hZY0KV9kZU8f7gI73AiQVFjVVv7/dUFA+ftA80aLic1Fmk2NoKSpAjcv1snhYX4MtN4OlBavFYsbHcBGgSkukQ5Ywy8KL6gd5KEuxZtvtkast8sX22PBXsMLFAhwTMDKAZhUBGGe8MV6xS3y0rOkqW6k7A/70IltA7bvAeJg4tue62ZbbQUBDz22HVhJFCWj0sucoeg4XsWHbho+eAU8Y0kqPo+i/wM34kLiO5yYVlwGK5yT5WWmjKvH/1bT/slLWIdzsJQdLy/cQMCxSPgJHx7dtJ1qP2UOehtrqOUIFqR9FP1MNLqNnrgSyjoPY8JyYCiXP7imK/i1TVnvbp10odmaJPNklnQqWVJjELN3kW240aklPg4gtbUF9ICoCC3Ei/AgIPwgUOGyIpWP419Y00GZO5UHc0+xJhD0RdvFKnlFDw1nVDAoEZoKZMrqUe0MccnR4cGiMNbHnTzY7mL5Rrzqg4NEgNd5r1/OsSypA14mZkrEWuEJBSenPAH0iwPyHXG4FFaSCkDkwnYiTevxQpO7IY3UUl8gdcKTbtoqrMhMh51jR7rJBRYFZqA2aUIIHLUXYDV6a6b9bDjuBg4CRHicmFVRYx+NRhFdtU+FNN5jJ3mpButzDKwH51MLDlMbwuFhDtiTnVAwYbyb1poIKQjdzQaizHoIe2IxZe3IwBtjTyfHgCirydAlmOsKKTrMGFRXoxL30uARtOdLIqXDmG64/f+8JMjhcaRKYiYKh8rqRaFOBRi94vAooIExk4G7h4tI/cbgSK+An/mxymGScCjJk/SzDvg+K7RF0KrCLgz2dW9lDsHhwYQpXEPGSJRXAtz5FnyAwK8FhY6TCCUXV3x0kGaUpzmdh3KWC1xImCR49S1FROLUTCMiGSSCocMU5QKAiKsQ9wh0BvCUcTpM5qKvSoKJoU4E+c3xSBusknNOEbPy85rgLPxRYt6SP0YHco0IY9E8gTlhNKiCki3R8APMwqzBSoUTyurloq8cpLzQwTggEtpGRWeD3bSoqUseBYHUAu3SqvD/gBZH3HsVmzTdlgPolvUeFaO36ggow/EvzMDgksIxIRSSljFgdr8UnFDIjowJEGOlwYEX3qGCjUOmPGS0wSCFbjNkC2gEVUJz09qru9grjFhUQ3UaGpkUvFN4r6kjHsQrXckEFePjIQQ7l3KaCvTNVIns0ZOyYChizwpkIA/bcoMK5Q0VQOyHgZCDmino7eYcKDdZwERlgSZtUyIpIKtAVixMO98q5okMqMDoYtnI01d9GBUZixBAkVipmPhSx1DR2jwq4g26Ai8wfSCqkZ4ZV1FSAA4xJYWYYg2QRjTunQtvAiuQP04qH3HjDAOEOU/Zgt0PfVbAUv56KMkFp6pDOjYGiAvUk3nwIMWKUXAFBLUxW0Q3EbMBwFV1TMQKfUiYAkVravELF3V7BA9CaFC3i0UJ7hAruVMCerw9qKgIUfXVCpg0qAhSxCYpTFMNVdE2FlhtCfvX25tkpIYvJyooKuYKw6YtT0TgydIzFHoRyLUtgNKmITEkFGw6moMKXy3MqPJ3xZXBhXFuJABish8GWhAs+xUnsQXTjhGUAFZGgQm9GTHozitT2HSPez7YeIVGLCkoJBdEy8wmJORUn9hdSMWU/1CJRVoZvOL4hnAcKlt9WVCTsQlABT+BUxIS4XFKZnXzDM9yqZH/ZYq6dTdke1PDXjAr2r/zExHEP3vTxXii1oDSu7ArAQb4DKkDXsAItAMb9aSeIfzDFav1V/+Ao8tUqL85vPLuofzVKZETOMgiH1rqpXGWz0XlGrQyzWXlZ7nllevTo0aNHjx49virKGcPqvq2lXL2cR2AMOX9TMejkO87PF8z561fkEdgmLGsnYcc/HbBhekmuB10efaXFv3ks5DcDKBNfcrUH3evglZ3+96Ui/9ePX/r+SP7sx8lrW/f7UjEal+VLJ7WKV+RR+D2osMZhfukPcLnrDUZnHvhXd8YyMpA1Gjf9+dtUBNfODfxyjFISu76/GW/m87n8LMZyGkfOesFbOjoNh+tjsIQvoq2V9X60nDqRcRJ5Cjj1sdC043CPCp1iOYj9aLC0NkN2s9aiIlisvSiepl+tj8xi1EmaxB7oOuXapgm3ZhNHR+NTmRBCqylqHEkkTrxMfDzOo3vctWEUE+Itg7lBUM0XupjbtKd7QoihNakI+YcsdDv6dM+7u8gjk9vquZoVqYCwAVyhquMnevA7FUSeCOOTKcZ64HkwbjREpSXpUJw45
qVC6GZuR9YaVMzAS0WH02Cm/6W4ACcI3dtNUlQlIxWrCELuV8uNYXKVLv9kh+66GB8YrVn4FSh3s6xcUE1bIkCvDm0HQYMfr9NPIkZvk4oAoht482UK/hIdRN/uDDMfvFBhGSjQq2ENHxvQ2XCBOkIEBXBa5L1ingcjcHKA45/8qClGqvW4eU3EKiaGS3bce8BjAnmwdc+pAI8v9EktwN73hb6MCxUTFo5S2CtAzPS4jnnHbbZIBTq6WHsTXY7AVmVzxTNrItzFqbDXYcBWEPS4mKkHtKjwlVkxNwbmL3LEuwZwJJCRCqb8LS0wCgliCuM94OZ+LoPvuBkGDCMyD5BjcSpk9Gm7dnjGr1g0qEBT2pTfan5wGIbH0AzacOJULCk3TTHopqmzhgMVIga3MMPgh2ZUHjMacSpiLp7CB0GJNIPQNhUYe8DEOwm780PDMDyGLcbkwZ8Qzl3Fm5gqeIKKBWZqUNHIY5dIhbL6N2zrmtGmAr81VN9JPt1n+SZCjDoAUhI6NAEVaCDNNRlQUqwg4kN1ggow3zlhK4/RcIeCSTUWIb/dNhUQ14duta8YrhJqTcg2mwyoWEwxZgSfGY7bFRyOvaQCvm4pHCgykadJBTSb20wDdFVoTpvoj4U0lbvsUt7/hUBvQXEeXsgVsGJ6uzIYbV3beUqvUYE+FfaB5ZnIPEbD+wq9NuxhNjsScr6YQkwjMg2DYEWoFxtfqW/MIiFoGpKKEoQg6hJwU9Xtq72CB1MQeXBFbFHBQ0EQzxHhvVrS5hocMF3bgw9SRp/8efH/NPDjMrk8RK7hJ8N8Lo3/M/UZBh6h7AoVWljnwS93tKnQNvIDPNjbWlTUsdj0T9iE/Gi2Xvv2l8L3K1SwVxzCJt1yTTnvj0+RzXZgdjTE1bF8NgwRQiFNDOMZB3g+jSnmWWOe0ZNjRA0v0knsUEq96Gj6hptoGn4N4hm3Y1aaYIQ+3+zkEw738eN73fpvL1FRiEANGJ1Lvtfx5LBJJ2IXHdRfcYBvOsgPM4TLaqMiHgar2WzV3FEEWXrYHQtNfiOirD8koRXH3eawvfKBh+7xABXBxPDM0GK1A1/L+OsIfx3h9VQEEIVJN8y5DTMcHV4t7nfGA70iRLcn/lEom3ylZb4bPDJX5FMfVFYQKvTw5zHx2LSpzXZT255Wk84+Nf6V8BgVfzR6KhR6KhR6KhR6KhR6KhR6KhR6KhR6KhR6KhR6KhR6KhRuU/HP31fwq6vbFa617Z9bVPz1/Qq+/eomdIVv11r31y0qruHPoeLFhvZU9FT0VPRU9FQ00FOh0FOh0FOh0FOh0FOh8GJD/wfTvkokTVDvpwAAAABJRU5ErkJggg=="
+
+ source_mapping:
+   GIEC et IPBES: "Science"
+   Textes Juridiques: "Loi"
+   Documents Stratégiques: "Politique"
+   ADEME: "ADEME"
+   # Presse: "Presse"
+
+ prompt_naming:
+   Science: "Science"
+   Loi: "Loi"
+   Politique: "Politique"
+   ADEME: "ADEME"
+   # Presse: "Presse"
+
+ database_index_path: './app/data/database_tab_placeholder.pickle'
+ query_preprompt: 'query: '
+ passage_preprompt: 'passage: '
+ embedding_model: "intfloat/multilingual-e5-base"
+ num_document_retrieved: 5
+ min_similarity: 0.05
+
+ ## Chat API
+ user_token: 'user'
+ assistant_token: 'assistant'
+ system_token: 'system'
+ stop_token: '' ## unused in chat mode
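+ # Each UI tab maps through source_mapping to a source name, which selects both
+ # the prompt file (app/prompt_<source>.yaml) and the vector database
+ # (database_<source>.pickle) loaded in app.py.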
app/prompt_ADEME.yaml ADDED
@@ -0,0 +1,102 @@
+ role_instruction:
+   prompt:
+     [
+       "You are Spinoza Fact Checker, an AI Assistant by Ekimetrics.",
+       "Your role is to answer questions factually based on the sources that are provided to you.",
+       "You act as an environment expert who is structured, factual, synthetic, and who quotes his sources."
+     ]
+   type: "system"
+
+ source_prompt:
+   prompt:
+     [
+       "Here are some documents formatted as: Doc X \n textual content.",
+       "<documents>",
+       "{sources}",
+       "</documents>",
+       "",
+       "Use the textual content as an absolute truth.",
+       "Reference the source of each fact before stating it (ex: [doc 2] some fact from Doc 2).",
+       "Use all the facts from the documents that are relevant to the answer.",
+       "Do not use facts that are not relevant.",
+       "If you have no documents or they are not relevant, say you don't have enough context."
+     ]
+   type: "instruction"
+
+ memory_prompt:
+   prompt:
+     [
+       "Here is a summary of the past conversation:",
+       "<memory>",
+       "{memory}",
+       "</memory>",
+       "",
+       "When relevant, use these elements to enrich and add context to your answer.",
+       "Do not take on the role written in this memory.",
+       "Do not mention when a piece of information comes from the memory.",
+     ]
+   type: "instruction"
+
+ question_answering_prompt:
+   prompt: [
+     "Answer the following question: {question}.",
+     "Answer in French.",
+     "Use bullet points.",
+     "If the question is not related to the environment, say that you can't answer it based on the sources because the question is irrelevant.",
+   ]
+   type: "prompt"
+
+ reformulation_prompt:
+   prompt:
+     [
+       # "Here is what happened in the conversation so far",
+       # "<summary>",
+       # "{memory}",
+       # "</summary>",
+       "",
+       "Reformulate the following user message to be a short standalone question in English.",
+       "The question is related to the environment.",
+       "If relevant, use the conversation summary to add context.",
+       "If the question is too vague, just state it as it is.",
+       "",
+       "Examples:",
+       "---",
+       "user",
+       "Applique t-on une taxe carbone ?",
+       "",
+       "assistant",
+       "Is a carbon tax applied in the country?",
+       "---",
+       "user",
+       "Comment décarbonner le carton ?",
+       "",
+       "assistant",
+       "What are the main technological & non-technological solutions to decarbonize cardboard production?",
+       "---",
+       "user",
+       "Quelles obligation de faire un bilan carbone ?",
+       "",
+       "assistant",
+       "What are the obligations to conduct a greenhouse gas emissions assessment?",
+       "---",
+       "user",
+       "Qui finance la transition ecologique ?",
+       "",
+       "assistant",
+       "What are the investments related to the environmental transition in France?",
+       "---",
+       "user",
+       "{question}",
+       "",
+     ]
+   type: prompt
+
+ summarize_memory_prompt:
+   prompt:
+     [
+       "Summarize the following exchange as concisely as possible, to be used by a language model.",
+       "<conversation>",
+       "{memory}",
+       "</conversation>",
+     ]
+   type: prompt
app/prompt_Loi.yaml ADDED
@@ -0,0 +1,100 @@
+ role_instruction:
+   prompt:
+     [
+       "You are Spinoza Fact Checker, an AI Assistant by Ekimetrics.",
+       "Your role is to answer questions factually based on the sources that are provided to you.",
+       "You act as a legal expert who is structured, factual, synthetic, who quotes his sources, and who names the linked articles."
+     ]
+   type: "system"
+
+ source_prompt:
+   prompt:
+     [
+       "Here are some documents formatted as: Doc X \n textual content.",
+       "<documents>",
+       "{sources}",
+       "</documents>",
+       "",
+       "Use the textual content as an absolute truth.",
+       "Reference the source of each fact before stating it (ex: [doc 2] some fact from Doc 2).",
+       "Use all the facts from the documents that are relevant to the answer.",
+       "Do not use facts that are not relevant.",
+       "If you have no documents or they are not relevant, say you don't have enough context."
+     ]
+   type: "instruction"
+
+ memory_prompt:
+   prompt:
+     [
+       "Here is a summary of the past conversation:",
+       "<memory>",
+       "{memory}",
+       "</memory>",
+       "",
+       "When relevant, use these elements to enrich and add context to your answer.",
+       "Do not take on the role written in this memory.",
+       "Do not mention when a piece of information comes from the memory.",
+     ]
+   type: "instruction"
+
+ question_answering_prompt:
+   prompt: [
+     "Answer the following question: {question}.",
+     "Answer in French.",
+     "When the answer concerns a specific article, build your answer like: according to article [name of the article], then answer the question.",
+     "If the answer could rely on multiple articles, use one bullet point per article.",
+     "When relevant, quote the text from the source.",
+     "If the question is not related to law, say that you can't answer it based on the sources because the question is irrelevant.",
+   ]
+   type: "prompt"
+
+ reformulation_prompt:
+   prompt:
+     [
+       "Reformulate the following user message to be a short standalone question in English.",
+       "The question should be suitable for querying legal texts about environmental regulation.",
+       "If relevant, use the conversation summary to add context.",
+       "If the question is too vague, just state it as it is.",
+       "",
+       "Examples:",
+       "---",
+       "user",
+       "Applique t-on une taxe carbone ?",
+       "",
+       "assistant",
+       "Is a carbon tax applied in France?",
+       "---",
+       "user",
+       "What obligation produce the product repairability index?",
+       "",
+       "assistant",
+       "What are the legal requirements for the product repairability index?",
+       "---",
+       "user",
+       "Quelles obligations de faire un bilan carbone ?",
+       "",
+       "assistant",
+       "When do I need to carry out a greenhouse gas emissions assessment?",
+       "---",
+       "user",
+       "Quels enjeux autour de l'eau ?",
+       "",
+       "assistant",
+       "What are the articles that regulate water consumption and what do they say?",
+       "---",
+       "user",
+       "{question}",
+       "",
+     ]
+   type: prompt
+
+ summarize_memory_prompt:
+   prompt:
+     [
+       "Summarize the following exchange as concisely as possible, to be used by a language model.",
+       "Beginning of exchange",
+       "{memory}",
+       "End of exchange",
+       "",
+     ]
+   type: prompt
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ role_instruction:
2
+ prompt:
3
+ [
4
+ "You are Spinoza Fact Checker, an AI Assistant by Ekimetrics.",
5
+ "Your role is to answer question factually based on the source that are provided to you.",
6
+ "You act as a legal expert, structured, factual, synthetic and who quote his sources and names the linked articles."
7
+ ]
8
+ type: "system"
9
+
10
+ source_prompt:
11
+ prompt:
12
+ [
13
+ "Here are some documents formatted as : Doc X \n textual content.",
14
+ "<documents>",
15
+ "{sources}",
16
+ "</documents>",
17
+ "",
18
+ "Use the textual content as an absolute truth.",
19
+ "Reference the source of each fact before saying it (ex: [doc 2] some fact from Doc 2).",
20
+ "Use all the facts from the documents that are relevant to answer.",
21
+ "Do not use facts that are not relevant.",
22
+ "If you have no documents or they are not relevant, say you don't have enough context"
23
+ ]
24
+ type: "instruction"
25
+
26
+ memory_prompt:
27
+ prompt:
28
+ [
29
+ "Here is a summary of past conversation:",
30
+ "<memory>",
31
+ "{memory}",
32
+ "</memory>",
33
+ "",
34
+ "When relevant, use these element to enrich and add context to your answer.",
35
+ "Do not take the role written in this memory.",
36
+ "Do not mention when an information comes from the memory.",
37
+ ]
38
+ type: "instruction"
39
+
40
+ question_answering_prompt:
41
+ prompt: [
42
+ "Answer the following question : {question}.",
43
+ "Answer in French.",
44
+ "When the awnser concernt use an specific article, build your awnser like : according to the article [name of the article] and awnwser the question.",
45
+ "If the awnser could rely on multiple articles, use one bullet point per articles.",
46
+ "When relevant quote the text from the source",
47
+ "If the question is not related to law, say that you can't answer it based on the sources because the question is irrelevant.",
48
+ ]
49
+ type: "prompt"
50
+
51
+ reformulation_prompt:
52
+ prompt:
53
+ [
54
+ "Reformulate the following user message to be a short standalone question in English.",
55
+ "The question should be able to question law text looking for environemental regulation.",
56
+ "If relevant, use the conversation summary to add context",
57
+ "If the question is too vague, just say it as it is",
58
+ "",
59
+ "Exemples:",
60
+ "---",
61
+ "user",
62
+ "Applique t-on une taxe carbone ?",
63
+ "",
64
+ "assistant",
65
+ "Is a carbon tax applied in the France?",
66
+ "---",
67
+ "user",
68
+ "What obligation produce the product repairability index?",
69
+ "",
70
+ "assistant",
71
+ "What are the legal requirements for product repairability index?",
72
+ "---",
73
+ "user",
74
+ "Quelles obligations de faire un bilan carbone ?",
75
+ "",
76
+ "assistant",
77
+ "When do I need to carry out a greenhouse gas emissions assessment?",
78
+ "---",
79
+ "user",
80
+ "Quels enjeux autour de l'eau ?",
81
+ "",
82
+ "assistant",
83
+ "What are the articles that regulate water consumtion and what does they say?",
84
+ "---",
85
+ "user",
86
+ "{question}",
87
+ "",
88
+ ]
89
+ type: prompt
90
+
91
+ summarize_memory_prompt:
92
+ prompt:
93
+ [
94
+ "Summarize the following exchange as concisely as possible to be used by a language model",
95
+ "Begining of exchange",
96
+ "{memory}",
97
+ "End of exchange",
98
+ "",
99
+ ]
100
+ type: prompt
app/prompt_Presse.yaml ADDED
@@ -0,0 +1,91 @@
+ role_instruction:
+   prompt:
+     [
+       "You are Spinoza Fact Checker, an AI Assistant by Ekimetrics.",
+       "Your role is to answer questions factually based on the sources that are provided to you.",
+       "All sources provided come from press releases and might not be considered absolute truth.",
+       "You act as an environment expert who is structured, factual, synthetic, and who quotes his sources."
+     ]
+   type: "system"
+
+ source_prompt:
+   prompt:
+     [
+       "Here are some documents formatted as: Doc X \n textual content.",
+       "<documents>",
+       "{sources}",
+       "</documents>",
+       "",
+       "Use the textual content as an absolute truth.",
+       "Reference the source of each fact before stating it (ex: [doc 2] some fact from Doc 2).",
+       "Use all the facts from the documents that are relevant to the answer.",
+       "Do not use facts that are not relevant.",
+       "If you have no documents or they are not relevant, say you don't have enough context."
+     ]
+   type: "instruction"
+
+ memory_prompt:
+   prompt:
+     [
+       "Here is a summary of the past conversation:",
+       "<memory>",
+       "{memory}",
+       "</memory>",
+       "",
+       "When relevant, use these elements to enrich and add context to your answer.",
+       "Do not take on the role written in this memory.",
+       "Do not mention when a piece of information comes from the memory.",
+     ]
+   type: "instruction"
+
+ question_answering_prompt:
+   prompt: [
+     "Answer the following question: {question}.",
+     "Answer in French.",
+     "Use bullet points.",
+     "If the question is not related to the environment, say that you can't answer it based on the sources because the question is irrelevant.",
+   ]
+   type: "prompt"
+
+ reformulation_prompt:
+   prompt:
+     [
+       # "Here is what happened in the conversation so far",
+       # "<summary>",
+       # "{memory}",
+       # "</summary>",
+       "",
+       "Reformulate the following user message to be a short standalone question in French.",
+       "The question is related to the environment.",
+       "If relevant, use the conversation summary to add context.",
+       "If the question is too vague, just state it as it is.",
+       "",
+       "Examples:",
+       "---",
+       "user",
+       "Applique t-on une taxe carbone ?",
+       "",
+       "assistant",
+       "Comment le sujet de la taxe carbone est-il traité dans le corpus ?",
+       "---",
+       "user",
+       "Quelles obligation de faire un bilan carbone ?",
+       "",
+       "assistant",
+       "Quelles sont les obligations qui imposent de faire un bilan carbone ?",
+       "---",
+       "user",
+       "{question}",
+       "",
+     ]
+   type: prompt
+
+ summarize_memory_prompt:
+   prompt:
+     [
+       "Summarize the following exchange as concisely as possible, to be used by a language model.",
+       "<conversation>",
+       "{memory}",
+       "</conversation>",
+     ]
+   type: prompt
app/prompt_Science.yaml ADDED
@@ -0,0 +1,95 @@
+ role_instruction:
+   prompt:
+     [
+       "You are Spinoza Fact Checker, an AI Assistant by Ekimetrics.",
+       "Your role is to answer questions factually based on the sources that are provided to you.",
+       "You act as a scientific expert who is structured, factual, synthetic, and who quotes his sources."
+     ]
+   type: "system"
+
+ source_prompt:
+   prompt:
+     [
+       "Here are some documents formatted as: Doc X \n textual content.",
+       "<documents>",
+       "{sources}",
+       "</documents>",
+       "",
+       "Use the textual content as an absolute truth.",
+       "Reference the source of each fact before stating it (ex: [doc 2] some fact from Doc 2).",
+       "Use all the facts from the documents that are relevant to the answer.",
+       "Do not use facts that are not relevant.",
+       "If you have no documents: <documents>\n\n</documents>, or they are not relevant, say you don't have enough context."
+     ]
+   type: "instruction"
+
+ memory_prompt:
+   prompt:
+     [
+       "Here is a summary of the past conversation:",
+       "<memory>",
+       "{memory}",
+       "</memory>",
+       "",
+       "When relevant, use these elements to enrich and add context to your answer.",
+       "Do not take on the role written in this memory.",
+       "Do not mention when a piece of information comes from the memory.",
+     ]
+   type: "instruction"
+
+ question_answering_prompt:
+   prompt: [
+     "Answer the following question: {question}.",
+     "Answer in French.",
+     "Use bullet points.",
+   ]
+   type: "prompt"
+
+ reformulation_prompt:
+   prompt:
+     [
+       # "Here is what happened in the conversation so far",
+       # "<summary>",
+       # "{memory}",
+       # "</summary>",
+       # "",
+       "Reformulate the following user message to be a short standalone question in English.",
+       "The question is related to science.",
+       "If relevant, use the conversation summary to add context.",
+       "If the question is too vague, just state it as it is.",
+       "",
+       "Examples:",
+       "---",
+       "user",
+       "La technologie nous sauvera-t-elle ?",
+       "",
+       "assistant",
+       "Can technology help humanity mitigate the effects of climate change?",
+       "---",
+       "user",
+       "what are our reserves in fossil fuel?",
+       "",
+       "assistant",
+       "What are the current reserves of fossil fuels and how long will they last?",
+       "---",
+       "user",
+       "what are the main causes of climate change?",
+       "",
+       "assistant",
+       "What are the main causes of climate change in the last century?",
+       "---",
+       "user",
+       "{question}",
+       "",
+     ]
+   type: prompt
+
+ summarize_memory_prompt:
+   prompt:
+     [
+       "Summarize the following exchange as concisely as possible, to be used by a language model.",
+       "<conversation>",
+       "{memory}",
+       "</conversation>",
+     ]
+   type: prompt
app/source/backend/document_store.py ADDED
@@ -0,0 +1,41 @@
+ from qdrant_client.http import models
+ import pickle
+ import torch
+ import io
+
+ device_str = "cuda:0" if torch.cuda.is_available() else "cpu"
+ device = torch.device(device_str)
+
+
+ class Device_Unpickler(pickle.Unpickler):
+     """Unpickler that remaps pickled torch storages onto the available device."""
+
+     def find_class(self, module, name):
+         if module == "torch.storage" and name == "_load_from_bytes":
+             return lambda b: torch.load(io.BytesIO(b), map_location=device_str)
+         else:
+             return super().find_class(module, name)
+
+
+ def pickle_to_document_store(path):
+     with open(path, "rb") as f:
+         document_store = Device_Unpickler(f).load()
+     document_store.embeddings.encode_kwargs["device"] = device_str
+     return document_store
+
+
+ def get_qdrant_filters(filter_dict: dict):
+     """Build a Qdrant filter based on a filter dict.
+
+     The filter dict must use metadata fields and be formatted like:
+
+     filter_dict = {'file_name': ['file1', 'file2'], 'sub_type': ['text']}
+     """
+     return models.Filter(
+         must=[
+             models.FieldCondition(
+                 key=f"metadata.{field}",
+                 match=models.MatchAny(any=filter_dict[field]),
+             )
+             for field in filter_dict
+         ]
+     )
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.source.backend.prompt_utils import SpecialTokens, make_chat_prompt
2
+ from langchain.prompts.chat import ChatPromptTemplate
3
+
4
+
5
+ def get_qa_prompts(config, prompts):
6
+ special_tokens = SpecialTokens(config)
7
+ role_instruction = make_chat_prompt(prompts["role_instruction"], special_tokens)
8
+ source_prompt = make_chat_prompt(prompts["source_prompt"], special_tokens)
9
+ # memory_prompt=make_chat_prompt(prompts['memory_prompt'], special_tokens)
10
+ question_answering_prompt = make_chat_prompt(
11
+ prompts["question_answering_prompt"], special_tokens
12
+ )
13
+ reformulation_prompt = make_chat_prompt(
14
+ prompts["reformulation_prompt"], special_tokens
15
+ )
16
+ summarize_memory_prompt = make_chat_prompt(
17
+ prompts["summarize_memory_prompt"], special_tokens
18
+ )
19
+
20
+ chat_qa_prompt = ChatPromptTemplate.from_messages(
21
+ [
22
+ role_instruction,
23
+ source_prompt,
24
+ # memory_prompt,
25
+ question_answering_prompt,
26
+ ]
27
+ )
28
+ chat_reformulation_prompt = ChatPromptTemplate.from_messages([reformulation_prompt])
29
+ # chat_summarize_memory_prompt = ChatPromptTemplate.from_messages([summarize_memory_prompt])
30
+ return (
31
+ chat_qa_prompt,
32
+ chat_reformulation_prompt,
33
+ ) # , chat_summarize_memory_prompt
app/source/backend/llm_utils.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import os
+
+ from langchain_openai import AzureChatOpenAI
+ from msal import ConfidentialClientApplication
+
+
+ class LLM:
+     def __init__(self, llm):
+         self.llm = llm
+         self.callbacks = []
+
+     def stream(self, prompt, prompt_arguments):
+         self.llm.streaming = True
+         streamed_content = self.llm.stream(prompt.format_messages(**prompt_arguments))
+         output = ""
+         for op in streamed_content:
+             output += op.content
+             yield output
+
+     def get_prediction(self, prompt, prompt_arguments):
+         self.llm.callbacks = self.callbacks
+         return self.llm.predict_messages(
+             prompt.format_messages(**prompt_arguments)
+         ).content
+
+     async def get_aprediction(self, prompt, prompt_arguments):
+         self.llm.callbacks = self.callbacks
+         prediction = await self.llm.apredict_messages(
+             prompt.format_messages(**prompt_arguments)
+         )
+         return prediction
+
+     async def get_apredictions(self, prompts, prompts_arguments):
+         self.llm.callbacks = self.callbacks
+         predictions = []
+         for prompt_, prompt_args_ in zip(prompts.keys(), prompts_arguments):
+             prediction = await self.llm.apredict_messages(
+                 prompts[prompt_].format_messages(**prompt_args_)
+             )
+             predictions.append(prediction.content)
+         return predictions
+
+
+ def get_token() -> str | None:
+     app = ConfidentialClientApplication(
+         client_id=os.getenv("CLIENT_ID"),
+         client_credential=os.getenv("CLIENT_SECRET"),
+         authority=f"https://login.microsoftonline.com/{os.getenv('TENANT_ID')}",
+     )
+     result = app.acquire_token_for_client(scopes=[os.getenv("SCOPE")])
+     # MSAL returns a dict on both success and failure; only successful
+     # calls carry "access_token", so check for the key, not just for None.
+     if result and "access_token" in result:
+         return result["access_token"]
+     return None
+
+
+ def get_llm():
+     token = get_token()
+     if token is None:
+         # Fail early instead of setting OPENAI_API_KEY to None (a TypeError).
+         raise RuntimeError(
+             "Could not acquire an Azure AD token; check CLIENT_ID, CLIENT_SECRET, TENANT_ID and SCOPE."
+         )
+     os.environ["OPENAI_API_KEY"] = token
+     os.environ["AZURE_OPENAI_ENDPOINT"] = (
+         f"{os.getenv('OPENAI_API_ENDPOINT')}{os.getenv('DEPLOYMENT_ID')}/chat/completions?api-version={os.getenv('OPENAI_API_VERSION')}"
+     )
+
+     return LLM(AzureChatOpenAI())
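A sketch of the intended call pattern. It assumes CLIENT_ID, CLIENT_SECRET, TENANT_ID, SCOPE, OPENAI_API_ENDPOINT, DEPLOYMENT_ID and OPENAI_API_VERSION are set (e.g. via the `.env` file loaded by `init_env`); the prompt variable names shown are placeholders, not the real template inputs:

```python
# Minimal sketch, reusing chat_qa_prompt from the previous snippet.
from app.source.backend.llm_utils import get_llm

llm = get_llm()

# stream() yields the accumulated answer so far, which maps directly onto
# Gradio's generator-based component updates.
for partial_answer in llm.stream(chat_qa_prompt, {"question": "...", "sources": "..."}):
    print(partial_answer)
```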
app/source/backend/prompt_utils.py ADDED
@@ -0,0 +1,68 @@
+ from langchain.prompts.chat import ChatMessagePromptTemplate
+
+
+ class SpecialTokens:
+     def __init__(self, config):
+         self.user_token = config["user_token"]
+         self.assistant_token = config["assistant_token"]
+         self.system_token = config["system_token"]
+         self.stop_token = config["stop_token"]
+
+
+ def to_instruction(query, special_tokens):
+     return special_tokens.user_token + query + special_tokens.stop_token
+
+
+ def to_prompt(query, special_tokens):
+     return (
+         special_tokens.user_token
+         + query
+         + special_tokens.stop_token
+         + special_tokens.assistant_token
+     )
+
+
+ def to_system(query, special_tokens):
+     return special_tokens.system_token + query + special_tokens.stop_token
+
+
+ def make_prompt(prompt, special_tokens):
+     prompt_type = prompt["type"]
+     if prompt_type == "system":
+         return to_system("\n".join(prompt["prompt"]), special_tokens)
+     elif prompt_type == "instruction":
+         return to_instruction("\n".join(prompt["prompt"]), special_tokens)
+     elif prompt_type == "prompt":
+         return to_prompt("\n".join(prompt["prompt"]), special_tokens)
+     else:
+         # Raise instead of returning an error string that would silently
+         # end up inside a prompt.
+         raise ValueError(f"Invalid prompt type {prompt_type!r}, please check your config")
+
+
+ def to_chat_instruction(query, special_tokens):
+     return ChatMessagePromptTemplate.from_template(
+         query, role=special_tokens.user_token
+     )
+
+
+ def to_chat_system(query, special_tokens):
+     return ChatMessagePromptTemplate.from_template(
+         query, role=special_tokens.system_token
+     )
+
+
+ def to_chat_prompt(query, special_tokens):
+     # Same construction as to_chat_instruction; kept separate to mirror make_prompt.
+     return ChatMessagePromptTemplate.from_template(
+         query, role=special_tokens.user_token
+     )
+
+
+ def make_chat_prompt(prompt, special_tokens):
+     prompt_type = prompt["type"]
+     if prompt_type == "system":
+         return to_chat_system("\n".join(prompt["prompt"]), special_tokens)
+     elif prompt_type == "instruction":
+         return to_chat_instruction("\n".join(prompt["prompt"]), special_tokens)
+     elif prompt_type == "prompt":
+         return to_chat_prompt("\n".join(prompt["prompt"]), special_tokens)
+     else:
+         raise ValueError(f"Invalid prompt type {prompt_type!r}, please check your config")
app/source/frontend/utils.py ADDED
@@ -0,0 +1,119 @@
+ import time
+ from queue import Empty, SimpleQueue
+ from threading import Thread
+
+ from dotenv import load_dotenv
+ from langchain.callbacks.base import BaseCallbackHandler
+
+ job_done = object()  # sentinel signalling that processing is done
+
+
+ class StreamingGradioCallbackHandler(BaseCallbackHandler):
+     """Callback handler for streaming. Only works with LLMs that support streaming."""
+
+     def __init__(self, q):
+         self.q = q
+
+     def on_llm_start(self, serialized, prompts, **kwargs) -> None:
+         """Run when LLM starts running. Drain tokens left over from a previous run."""
+         while not self.q.empty():
+             try:
+                 self.q.get(block=False)
+             except Empty:  # queue.Empty, not SimpleQueue.empty (which is a method)
+                 continue
+
+     def on_llm_new_token(self, token, **kwargs) -> None:
+         """Run on new LLM token. Only available when streaming is enabled."""
+         self.q.put(token)
+
+     def on_llm_end(self, response, **kwargs) -> None:
+         """Run when LLM ends running."""
+         self.q.put(job_done)
+
+     def on_llm_error(self, error, **kwargs) -> None:
+         """Run when LLM errors."""
+         self.q.put(job_done)
+
+
+ def add_gradio_streaming(llm):
+     q = SimpleQueue()
+     llm.callbacks = [StreamingGradioCallbackHandler(q)]
+     return llm, q
+
+
+ def gradio_stream(llm, prompt, q):
+     # q is the queue returned by add_gradio_streaming for this llm; the LLM
+     # runs in a worker thread while this generator drains the queue.
+     thread = Thread(target=llm.predict, kwargs={"text": prompt})
+     thread.start()
+     text = ""
+     while True:
+         next_token = q.get(block=True)  # blocks until an input is available
+         if next_token is job_done:
+             break
+         text += next_token
+         time.sleep(0.03)
+         yield text
+     thread.join()
+
+
+ def get_source_link(metadata):
+     return metadata["file_url"] + f"#page={metadata['content_page_number'] + 1}"
+
+
+ def make_html_presse_source(source, i, tag, score, config):
+     meta = source.metadata
+     return f"""
+ <div class="card" id="btn_{tag}_{i}_source" style="display:none;">
+     <div class="card-content">
+         <div class="card-content-column-1">
+             <p><strong>Titre :</strong> {meta['file_title']}</p>
+             <p><strong>Auteur/s :</strong> {meta['file_source_type']}</p>
+             <p><strong>Date :</strong> {meta['file_date_publishing']}</p>
+         </div>
+         <div class="card-content-column-2">
+             <p><strong>Paragraphe id :</strong> {source.page_content.replace(config["passage_preprompt"], "")}</p>
+         </div>
+     </div>
+     <div class="card-footer">
+         <span>[{i}]</span>
+         <span>Relevance Score : {round(100*score,1)}%</span>
+     </div>
+ </div>
+ """
+
+
+ def make_html_source(source, i, tag, score, config):
+     meta = source.metadata
+     return f"""
+ <div class="card" id="btn_{tag}_{i}_source" style="display:none;">
+     <div class="card-content">
+         <div class="card-content-column-1">
+             <p><strong>Titre :</strong> {meta['file_title']}</p>
+             <p><strong>Auteur/s :</strong> {meta['file_source_type']}</p>
+             <p><strong>Date :</strong> {meta['file_date_publishing']}</p>
+         </div>
+         <div class="card-content-column-2">
+             <p><strong>Paragraphe id :</strong> {source.page_content.replace(config["passage_preprompt"], "")}</p>
+         </div>
+     </div>
+     <div class="card-footer">
+         <span>[{i}]</span>
+         <span><a href="{get_source_link(meta)}" target="_blank">Lien source</a></span>
+         <span>Page {meta['content_page_number'] + 1}</span>
+         <span>Relevance Score : {round(100*score,1)}%</span>
+     </div>
+ </div>
+ """
+
+
+ def clear_text_box(textbox):
+     return ""
+
+
+ def add_text(chatbot, text):
+     chatbot = chatbot + [(text, None)]
+     return chatbot, text
+
+
+ def init_env():
+     try:
+         load_dotenv()
+     except Exception:
+         pass
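The queue helpers are an alternative streaming path to `LLM.stream` in llm_utils.py: the callback pushes tokens onto a queue from a worker thread while the generator drains it. A sketch of the wiring, assuming `chat_model` is a LangChain LLM with streaming support and the three-argument `gradio_stream` signature above:

```python
# Minimal sketch; `chat_model` is a hypothetical streaming-capable LangChain LLM.
chat_model, q = add_gradio_streaming(chat_model)

for partial_text in gradio_stream(chat_model, "What do the sources say?", q):
    ...  # push partial_text into the Gradio chatbot component
```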
assets/style.css ADDED
@@ -0,0 +1,172 @@
+ :root {
+     --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
+ }
+
+ .warning-box {
+     background-color: #fff3cd;
+     border: 1px solid #ffeeba;
+     border-radius: 4px;
+     padding: 15px 20px;
+     font-size: 14px;
+     color: #856404;
+     display: inline-block;
+     margin-bottom: 15px;
+ }
+
+ .tip-box {
+     background-color: #f0f9ff;
+     border: 1px solid #80d4fa;
+     border-radius: 4px;
+     margin-top: 20px;
+     padding: 15px 20px;
+     font-size: 14px;
+     color: #006064;
+     display: inline-block;
+     margin-bottom: 15px;
+     width: auto;
+ }
+
+ .tip-box-title {
+     font-weight: bold;
+     font-size: 14px;
+     margin-bottom: 5px;
+ }
+
+ .light-bulb {
+     display: inline;
+     margin-right: 5px;
+ }
+
+ .gr-box { border-color: #d6c37c; }
+
+ #hidden-message {
+     display: none;
+ }
+
+ .message {
+     font-size: 14px !important;
+ }
+
+ a {
+     text-decoration: none;
+     color: inherit;
+ }
+
+ .card {
+     background-color: white;
+     border-radius: 10px;
+     box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+     overflow: hidden;
+     display: flex;
+     flex-direction: column;
+     margin: 20px;
+ }
+
+ .card-content {
+     padding: 20px;
+ }
+
+ .card-content h2 {
+     font-size: 14px !important;
+     font-weight: bold;
+     margin-bottom: 10px;
+     margin-top: 0px !important;
+     color: #577b9b !important;
+ }
+
+ .card-content p {
+     font-size: 12px;
+     margin-bottom: 0;
+ }
+
+ .card-content-column-1 {
+     float: left;
+     width: 20%;
+ }
+
+ .card-content-column-2 {
+     float: left;
+     width: 80%;
+ }
+
+ .card-footer {
+     background-color: #f4f4f4;
+     font-size: 10px;
+     padding: 10px;
+     display: flex;
+     justify-content: space-between;
+     align-items: center;
+ }
+
+ .card-footer span {
+     flex-grow: 1;
+     text-align: left;
+     color: #999 !important;
+ }
+
+ .pdf-link {
+     display: inline-flex;
+     align-items: center;
+     margin-left: auto;
+     text-decoration: none !important;
+     font-size: 14px;
+ }
+
+ .message.user {
+     background-color: #7494b0 !important;
+     border: none;
+     color: white !important;
+ }
+
+ .message.bot {
+     background-color: #f2f2f7 !important;
+     border: none;
+ }
+
+ .gallery-item > div:hover {
+     background-color: #7494b0 !important;
+     color: white !important;
+ }
+
+ .gallery-item:hover {
+     border: #7494b0 !important;
+ }
+
+ .gallery-item > div {
+     background-color: white !important;
+     color: #577b9b !important;
+ }
+
+ .label {
+     color: #577b9b !important;
+ }
+
+ label.selected {
+     background: none !important;
+ }
+
+ .paginate {
+     color: #577b9b !important;
+ }
+
+ label > span {
+     background-color: white !important;
+     color: #577b9b !important;
+ }
+
+ /* Pseudo-element for the circularly cropped avatar on bot messages */
+ .message.bot::before {
+     content: '';
+     position: absolute;
+     top: -10px;
+     left: -10px;
+     width: 30px;
+     height: 30px;
+     background-image: var(--user-image);
+     background-size: cover;
+     background-position: center;
+     border-radius: 50%;
+     z-index: 10;
+ }
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,25 @@
+ [tool.poetry]
+ name = "spinoza-project"
+ version = "0.1.0"
+ description = ""
+ authors = ["Miguel Omenaca Muro <miguel.omenacamuro@ekimetrics.com>"]
+ readme = "README.md"
+ package-mode = false
+
+ [tool.poetry.dependencies]
+ python = "^3.10"
+ langchain = "^0.2.5"
+ gradio = {extras = ["oauth"], version = "^4.36.1"}
+ sentence-transformers = "2.2.2"
+ msal = "^1.28.1"
+ langchain-openai = "^0.1.8"
+ qdrant-client = "^1.9.1"
+ loadenv = "^0.1.1"  # note: the code imports `dotenv` (python-dotenv), so this entry may be misnamed
+ datasets = "^2.20.0"
+ langchain-community = "^0.2.5"
+ transformers = "4.39.0"
+
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
The diff for this file is too large to render. See raw diff