Pclanglais committed on
Commit
f2019a4
1 Parent(s): eea75fc

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +7 -6
  2. app.py +216 -0
  3. gitattributes +36 -0
  4. requirements.txt +14 -0
  5. theme_builder.py +3 -0
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: Tchap
3
- emoji: 📊
4
- colorFrom: purple
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.31.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Guillaume-Tell
3
+ emoji: 📜
4
+ colorFrom: gray
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 3.50.2
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import transformers
2
+ import re
3
+ from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM
4
+ from vllm import LLM, SamplingParams
5
+ import torch
6
+ import gradio as gr
7
+ import json
8
+ import os
9
+ import shutil
10
+ import requests
11
+ import chromadb
12
+ import pandas as pd
13
+ from chromadb.config import Settings
14
+ from chromadb.utils import embedding_functions
15
+
16
# Resolve the compute device once: GPU when available, CPU otherwise.
# (The previous hard-coded `device = "cuda:0"` assignment was dead code —
# it was immediately overwritten by this availability check.)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Embedding function used by the Chroma collection for querying; now
# device-aware instead of unconditionally requesting "cuda".
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="intfloat/multilingual-e5-base", device=device
)
client = chromadb.PersistentClient(path="mfs_vector")
collection = client.get_collection(name="sp_expanded", embedding_function=sentence_transformer_ef)

# Default generation hyper-parameters.
# NOTE(review): MistralChatBot.predict() hard-codes its own sampling
# settings, so these module-level values are currently unused — confirm
# before wiring them in or removing them.
temperature = 0.2
max_new_tokens = 1000
top_p = 0.92
repetition_penalty = 1.7

model_name = "AgentPublic/Guillaume-Tell"

# vLLM engine serving the model with a 4096-token context window.
llm = LLM(model_name, max_model_len=4096)
33
+
34
# Vector search over the database
def vector_search(collection, text):
    """Query the Chroma collection for the 5 passages closest to *text*.

    Returns a pair:
      - a plain-text listing ("identifier : passage" entries separated by
        blank lines) to embed into the LLM prompt,
      - an HTML listing of the same passages, where each card's element id
        is the passage identifier so reference tooltips can link to it
        with "#identifier".
    """
    results = collection.query(
        query_texts=[text],
        n_results=5,
    )

    document = []
    document_html = []
    # The result fields are parallel lists nested under the single query,
    # hence the [0] indexing. enumerate() replaces the original manual
    # `list_elm` counter (and the unused `id_list` variable is gone).
    for rank, passage in enumerate(results["documents"][0]):
        identifier = str(results["metadatas"][0][rank]["identifier"])

        document.append(identifier + " : " + passage)
        document_html.append(
            '<div class="source" id="' + identifier + '"><p><b>'
            + identifier + "</b> : " + passage + "</div>"
        )

    document = "\n\n".join(document)
    document_html = '<div id="source_listing">' + "".join(document_html) + "</div>"
    return document, document_html
58
+
59
# CSS for the Gradio app: answer layout (.generation), source cards
# (.source, highlighted via :target when a reference link is clicked),
# and the hover tooltips attached to inline references (.tooltip).
css = """
.generation {
margin-left:2em;
margin-right:2em;
size:1.2em;
}

:target {
background-color: #CCF3DF; /* Change the text color to red */
}

.source {
float:left;
max-width:17%;
margin-left:2%;
}

.tooltip {
position: relative;
cursor: pointer;
font-variant-position: super;
color: #97999b;
}

.tooltip:hover::after {
content: attr(data-text);
position: absolute;
left: 0;
top: 120%; /* Adjust this value as needed to control the vertical spacing between the text and the tooltip */
white-space: pre-wrap; /* Allows the text to wrap */
width: 500px; /* Sets a fixed maximum width for the tooltip */
max-width: 500px; /* Ensures the tooltip does not exceed the maximum width */
z-index: 1;
background-color: #f9f9f9;
color: #000;
border: 1px solid #ddd;
border-radius: 5px;
padding: 5px;
display: block;
box-shadow: 0 4px 8px rgba(0,0,0,0.1); /* Optional: Adds a subtle shadow for better visibility */
}"""
101
+
102
# Courtesy of chatgpt
def format_references(text):
    """Replace each <ref text="...">id</ref> span in *text* with a numbered
    superscript tooltip linking to "#id" (the matching source card).

    Fix over the original: a malformed reference (start marker without a
    closing '">' or '</ref>') previously broke out of the loop and silently
    dropped everything from the marker onward; the remainder is now kept
    verbatim.
    """
    # Start and end markers for a reference.
    ref_start_marker = '<ref text="'
    ref_end_marker = '</ref>'

    parts = []          # accumulated output fragments
    current_pos = 0     # scan position in *text*
    ref_number = 1      # running [n] label

    # Loop until no more reference start markers are found.
    while True:
        start_pos = text.find(ref_start_marker, current_pos)
        if start_pos == -1:
            # No more references: keep the tail of the text.
            parts.append(text[current_pos:])
            break

        # Text preceding the reference.
        parts.append(text[current_pos:start_pos])

        # End of the text="..." attribute.
        end_pos = text.find('">', start_pos)
        if end_pos == -1:
            # Malformed reference: keep the remainder verbatim and stop
            # (avoids both data loss and an infinite loop).
            parts.append(text[start_pos:])
            break

        # Quoted passage, flattened to one line and HTML-escaped for the
        # data-text attribute.
        ref_text = text[start_pos + len(ref_start_marker):end_pos].replace('\n', ' ').strip()
        ref_text_encoded = ref_text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

        # End of the reference tag.
        ref_end_pos = text.find(ref_end_marker, end_pos)
        if ref_end_pos == -1:
            # Malformed reference: same recovery as above.
            parts.append(text[start_pos:])
            break

        # Reference identifier between '">' and '</ref>'.
        ref_id = text[end_pos + 2:ref_end_pos].strip()

        # HTML for the numbered tooltip.
        tooltip_html = (
            f'<span class="tooltip" data-refid="{ref_id}" '
            f'data-text="{ref_id}: {ref_text_encoded}">'
            f'<a href="#{ref_id}">[{ref_number}]</a></span>'
        )
        parts.append(tooltip_html)

        # Resume scanning after the closing tag.
        current_pos = ref_end_pos + len(ref_end_marker)
        ref_number += 1

    return ''.join(parts)
155
+
156
# Class to encapsulate the Guillaume-Tell (Mistral-based) chatbot.
class MistralChatBot:
    """Retrieval-augmented chatbot.

    predict() retrieves the five closest fiches from the Chroma collection,
    builds a ChatML prompt asking the model for a referenced answer,
    generates with the vLLM engine, and returns HTML for the Gradio UI.
    """

    def __init__(self, system_prompt="Le dialogue suivant est une conversation"):
        # NOTE(review): kept for interface compatibility, but predict()
        # hard-codes its own system prompt and never reads this attribute —
        # confirm before removing.
        self.system_prompt = system_prompt

    def predict(self, user_message):
        """Answer *user_message*; returns (answer_html, sources_html)."""
        fiches, fiches_html = vector_search(collection, user_message)
        # Sampling settings are hard-coded here (the module-level defaults
        # are not used); "``" stops runaway code-block output.
        sampling_params = SamplingParams(temperature=.7, top_p=.95, max_tokens=2000, presence_penalty = 1.5, stop = ["``"])
        detailed_prompt = """<|im_start|>system
Tu es Albert, le chatbot des Maisons France Service qui donne des réponses sourcées.<|im_end|>
<|im_start|>user
Ecrit un texte référencé en réponse à cette question : """ + user_message + """

Les références doivent être citées de cette manière : texte rédigé<ref text=\"[passage pertinent dans la référence]\">[\"identifiant de la référence\"]</ref>Si les références ne permettent pas de répondre, qu'il n'y a pas de réponse.

Les cinq références disponibles : """ + fiches + "<|im_end|>\n<|im_start|>assistant\n"
        print(detailed_prompt)  # debug: log the full prompt to stdout
        prompts = [detailed_prompt]
        outputs = llm.generate(prompts, sampling_params, use_tqdm = False)
        generated_text = outputs[0].outputs[0].text
        # Bug fix: both headings previously opened <h2> but closed </h3>.
        generated_text = '<h2 style="text-align:center">Réponse</h2>\n<div class="generation">' + format_references(generated_text) + "</div>"
        fiches_html = '<h2 style="text-align:center">Sources</h2>\n' + fiches_html
        return generated_text, fiches_html
179
+
180
# Create the chatbot instance (Mistral-based Guillaume-Tell — the original
# comment incorrectly said "Falcon").
mistral_bot = MistralChatBot()

# Gradio interface metadata.
# NOTE(review): title, description, examples and additional_inputs are
# defined but never wired into the Blocks UI below — presumably left over
# from a gr.ChatInterface draft; confirm before deleting.
title = "Guillaume-Tell"
description = "Le LLM répond à des questions administratives sur l'éducation nationale à partir de sources fiables."
examples = [
    [
        "Qui peut bénéficier de l'AIP?",  # user_message
        0.7  # temperature
    ]
]

additional_inputs = [
    gr.Slider(
        label="Température",
        value=0.2,  # Default value
        minimum=0.05,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Des valeurs plus élevées donne plus de créativité, mais aussi d'étrangeté",
    ),
]

# Build the UI. (The original `demo = gr.Blocks()` line was dead code:
# it was immediately replaced by this context manager's `as demo`.)
with gr.Blocks(theme='JohnSmith9982/small_and_pretty', css=css) as demo:
    gr.HTML("""<h1 style="text-align:center">Albert (Guillaume-Tell)</h1>""")
    text_input = gr.Textbox(label="Votre question ou votre instruction.", type="text", lines=1)
    text_button = gr.Button("Interroger Albert")
    text_output = gr.HTML(label="La réponse d'Albert")
    embedding_output = gr.HTML(label="Les sources utilisées")
    text_button.click(mistral_bot.predict, inputs=text_input, outputs=[text_output, embedding_output])

if __name__ == "__main__":
    demo.queue().launch()
gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ education_database/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ einops
4
+ accelerate
5
+ tiktoken
6
+ scipy
7
+ transformers_stream_generator==0.0.4
8
+ peft
9
+ deepspeed
10
+ bitsandbytes
11
+ optimum
12
+ vllm==0.3.2
13
+ chromadb
14
+ sentence_transformers
theme_builder.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
import gradio as gr

# Standalone developer utility: launches Gradio's interactive theme
# builder in the browser. Not imported by app.py.
gr.themes.builder()