Mollel committed on
Commit
43be956
•
1 Parent(s): 2507728

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +400 -0
app.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import datetime
import html
import os
from collections import Counter
from typing import Dict, Tuple
from uuid import UUID

import altair as alt
import argilla as rg
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from argilla.client.feedback.dataset.remote.dataset import RemoteFeedbackDataset
from argilla.feedback import FeedbackDataset
13
+
14
"""
This is the main file for the dashboard application. It contains the main function, the functions that obtain the data, and the functions that create the charts.
It is designed as a template for recreating the dashboard of the prompt translation project for any language.

To create a new dashboard you need several environment variables, which you can easily set in the Hugging Face Space that hosts the dashboard:

- SOURCE_DATASET: The dataset id of the source dataset.
- SOURCE_WORKSPACE: The workspace id of the source dataset.
- TARGET_RECORDS: The number of records you are aiming to annotate. We usually set this to 500.
- ARGILLA_API_URL: Link to the Hugging Face Space where the annotation effort is hosted. For example, the Spanish one is https://somosnlp-dibt-prompt-translation-for-es.hf.space/
- ARGILLA_API_KEY: The API key to access the Hugging Face Space. Please store this as a secret in the Hugging Face Space configuration.
"""
26
+
27
# Display strings used for chart legends and titles.
# Translate these values to localize the dashboard.
ANNOTATED = 'Annotations'
NUMBER_ANNOTATED = 'Total Annotations'
PENDING = 'Pending'

NUMBER_ANNOTATORS = "Number of annotators"
NAME = 'Username'
NUMBER_ANNOTATIONS = 'Number of annotations'

CATEGORY = 'Category'
37
+
38
def obtain_source_target_datasets() -> (
    "Tuple[FeedbackDataset | RemoteFeedbackDataset, FeedbackDataset | RemoteFeedbackDataset]"
):
    """
    Load the source dataset from Argilla and split it by response status.

    The dataset name is read from the SOURCE_DATASET environment variable and the
    workspace from SOURCE_WORKSPACE.

    Returns:
        A tuple ``(pending, submitted)``: the source dataset filtered by the response
        status 'pending', and the same source dataset filtered by the response
        status 'submitted'.

    Raises:
        ValueError: If SOURCE_DATASET is not set (or is empty).
    """
    dataset_name = os.getenv("SOURCE_DATASET")
    if not dataset_name:
        # Fail early with an actionable message instead of passing None to Argilla.
        raise ValueError(
            "The SOURCE_DATASET environment variable must be set to the name of the source dataset."
        )

    # Obtain the public dataset and see how many pending records are there
    source_dataset = rg.FeedbackDataset.from_argilla(
        dataset_name, workspace=os.getenv("SOURCE_WORKSPACE")
    )
    filtered_source_dataset = source_dataset.filter_by(response_status=["pending"])

    # The submitted records are taken from the same source dataset; no separate
    # results dataset/workspace is read here.
    target_dataset = source_dataset.filter_by(response_status=["submitted"])

    return filtered_source_dataset, target_dataset
65
+
66
+
67
def get_user_annotations_dictionary(
    dataset: "FeedbackDataset | RemoteFeedbackDataset",
) -> Dict[str, int]:
    """
    Count the responses in a dataset per user.

    Args:
        dataset: The dataset to be analyzed. Each record's ``responses`` are
            iterated and grouped by their ``user_id``.

    Returns:
        A dictionary with the username as the key and the number of annotations
        (responses) as the value.
    """
    # Count responses per user id first, so each user is resolved only once below.
    counts_by_id = Counter(
        str(response.user_id) for record in dataset for response in record.responses
    )

    # Build a separate username-keyed mapping rather than renaming keys in place:
    # that way a username can never clobber an id key that is still pending, and
    # ids that resolve to the same username have their counts added together.
    counts_by_username: Dict[str, int] = {}
    for user_id, count in counts_by_id.items():
        username = rg.User.from_id(UUID(user_id)).username
        counts_by_username[username] = counts_by_username.get(username, 0) + count

    return counts_by_username
91
+
92
+
93
def donut_chart_total() -> alt.Chart:
    """
    Build a donut chart with the progress of the total annotations.

    The annotated count is the number of records in the module-level
    ``target_dataset``; the pending count is TARGET_RECORDS minus that number,
    never less than zero.

    Returns:
        An altair chart with the donut chart.
    """
    annotated_records = len(target_dataset)
    # Clamp at zero: once the target is exceeded a negative "pending" value
    # would be meaningless as an arc size.
    pending_records = max(int(os.getenv("TARGET_RECORDS")) - annotated_records, 0)

    # Prepare data for the donut chart
    source = pd.DataFrame(
        {
            "values": [annotated_records, pending_records],
            "category": [ANNOTATED, PENDING],
            "colors": ["#4682b4", "#e68c39"],  # Blue for annotated, orange for pending
        }
    )

    # Explicit domain/range so each category always gets its own colour.
    domain = source["category"].tolist()
    range_ = source["colors"].tolist()

    base = alt.Chart(source).encode(
        theta=alt.Theta("values:Q", stack=True),
        radius=alt.Radius(
            "values", scale=alt.Scale(type="sqrt", zero=True, rangeMin=20)
        ),
        color=alt.Color(
            field="category",
            type="nominal",
            scale=alt.Scale(domain=domain, range=range_),
            legend=alt.Legend(title=CATEGORY),
        ),
    )

    arcs = base.mark_arc(innerRadius=20, stroke="#fff")
    labels = base.mark_text(radiusOffset=20).encode(text="values:Q")

    return arcs + labels
133
+
134
+
135
def _kpi_text_chart(title: str, value: int, color: str) -> alt.Chart:
    """Build a single-number KPI chart: ``value`` drawn as large text under ``title``."""
    data = pd.DataFrame({"Category": [title], "Value": [value]})
    return (
        alt.Chart(data)
        .mark_text(fontSize=100, align="center", baseline="middle", color=color)
        .encode(text="Value:N")
        .properties(title=title, width=250, height=200)
    )


def kpi_chart_remaining() -> alt.Chart:
    """
    Build a KPI chart with the remaining amount of records to be annotated.

    The value is TARGET_RECORDS minus the number of records in the module-level
    ``target_dataset``, never less than zero.

    Returns:
        An altair chart with the KPI chart.
    """
    # Clamp at zero so exceeding the target does not display a negative number.
    pending_records = max(int(os.getenv("TARGET_RECORDS")) - len(target_dataset), 0)
    return _kpi_text_chart(PENDING, pending_records, "#e68b39")


def kpi_chart_submitted() -> alt.Chart:
    """
    Build a KPI chart with the total amount of records that have been annotated.

    The value is the number of records in the module-level ``target_dataset``.

    Returns:
        An altair chart with the KPI chart.
    """
    return _kpi_text_chart(NUMBER_ANNOTATED, len(target_dataset), "steelblue")


def kpi_chart_total_annotators() -> alt.Chart:
    """
    Build a KPI chart with the total amount of annotators.

    The value is the number of entries in the module-level ``user_ids_annotations``.

    Returns:
        An altair chart with the KPI chart.
    """
    return _kpi_text_chart(NUMBER_ANNOTATORS, len(user_ids_annotations), "steelblue")
205
+
206
+
207
def render_hub_user_link(hub_id: str) -> str:
    """
    Return an HTML link to the user's profile on Hugging Face.

    Args:
        hub_id: The user's id on Hugging Face.

    Returns:
        A string with an ``<a>`` element that opens the user's Hugging Face profile
        in a new tab and shows the id as the link text.
    """
    # Escape the id before embedding it in markup so special characters cannot
    # break out of the attribute or inject HTML; plain ids are left unchanged.
    safe_id = html.escape(hub_id, quote=True)
    link = f"https://huggingface.co/{safe_id}"
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{safe_id}</a>'
219
+
220
+
221
def obtain_top_users(user_ids_annotations: Dict[str, int], N: int = 50) -> pd.DataFrame:
    """
    Return the top N users with the most annotations.

    Args:
        user_ids_annotations: A dictionary with the user name/id as the key and the
            number of annotations as the value. Each key is rendered as a Hugging
            Face profile link.
        N: The maximum number of users to return. 50 by default.

    Returns:
        A pandas dataframe with the top N users with the most annotations, sorted
        from most to fewest annotations.
    """
    dataframe = pd.DataFrame(
        list(user_ids_annotations.items()), columns=[NAME, NUMBER_ANNOTATIONS]
    )
    dataframe[NAME] = dataframe[NAME].apply(render_hub_user_link)
    # A stable sort keeps users with equal counts in their incoming order, so the
    # leaderboard does not reshuffle ties between calls.
    dataframe = dataframe.sort_values(
        by=NUMBER_ANNOTATIONS, ascending=False, kind="stable"
    )
    return dataframe.head(N)
238
+
239
+
240
def fetch_data() -> None:
    """
    Fetch the data from the source and target datasets and update the global variables.

    Updates ``source_dataset``, ``target_dataset``, ``user_ids_annotations``,
    ``annotated``, ``remaining`` and ``percentage_completed``. ``top_dataframe`` is
    declared global as well but is not assigned here.
    """
    global source_dataset, target_dataset, user_ids_annotations, annotated, remaining, percentage_completed, top_dataframe

    print(f"Starting to fetch data: {datetime.datetime.now()}")

    source_dataset, target_dataset = obtain_source_target_datasets()
    user_ids_annotations = get_user_annotations_dictionary(target_dataset)

    # Parse the target once and reuse it for both derived figures.
    target_records = int(os.getenv("TARGET_RECORDS"))

    annotated = len(target_dataset)
    # Never report a negative number of remaining records.
    remaining = max(target_records - annotated, 0)
    # Guard against a zero (or negative) target, which would otherwise divide by zero.
    if target_records > 0:
        percentage_completed = round((annotated / target_records) * 100, 1)
    else:
        percentage_completed = 0.0

    # Print the current date and time
    print(f"Data fetched: {datetime.datetime.now()}")
259
+
260
+
261
def get_top(N: int = 50) -> pd.DataFrame:
    """
    Return the top N users with the most annotations.

    Reads the module-level ``user_ids_annotations`` and delegates to
    ``obtain_top_users``.

    Args:
        N: The number of users to be returned. 50 by default.

    Returns:
        A pandas dataframe with the top N users with the most annotations.
    """
    return obtain_top_users(user_ids_annotations, N=N)
273
+
274
+
275
def main() -> None:
    """
    Connect to Argilla, fetch the data once, then build and launch the Gradio dashboard.

    The Argilla connection uses the ARGILLA_API_URL and ARGILLA_API_KEY environment
    variables. Every chart and the leaderboard table are filled by ``demo.load``
    callbacks when the page is loaded.
    """
    # Connect to the space with rg.init()
    rg.init(
        api_url=os.getenv("ARGILLA_API_URL"),
        api_key=os.getenv("ARGILLA_API_KEY"),
    )

    # Fetch the data initially; the chart callbacks below read the globals it sets.
    # NOTE(review): BackgroundScheduler is imported by this file but no job is ever
    # scheduled, so the data is only fetched once at startup — confirm whether a
    # periodic refresh of fetch_data was intended.
    fetch_data()

    # To avoid the orange border for the Gradio elements that are in constant loading
    css = """
    .generating {
        border: none;
    }
    """

    # Flag emojis written as pairs of regional-indicator code points (escaped so
    # the literals survive any re-encoding of this file).
    COUNTRY_FLAGS = {
        "Tanzania": "\U0001F1F9\U0001F1FF",  # TZ
        "Kenya": "\U0001F1F0\U0001F1EA",  # KE
        "Democratic Republic of the Congo": "\U0001F1E8\U0001F1E9",  # CD
        "Uganda": "\U0001F1FA\U0001F1EC",  # UG
        "Rwanda": "\U0001F1F7\U0001F1FC",  # RW
        "Burundi": "\U0001F1E7\U0001F1EE",  # BI
        "Mozambique": "\U0001F1F2\U0001F1FF",  # MZ
        "Somalia": "\U0001F1F8\U0001F1F4",  # SO
        "Comoros": "\U0001F1F0\U0001F1F2",  # KM
    }

    # Create a string that contains just the flag emojis
    flags_string = " ".join(COUNTRY_FLAGS.values())

    # Add the world emoji at the beginning
    # NOTE(review): assumed to be the Europe-Africa globe (U+1F30D) — confirm.
    full_string = f"\U0001F30D {flags_string}"

    with gr.Blocks(css=css) as demo:
        gr.Markdown(
            f"""
            # {full_string} Swahili - Multilingual Prompt Evaluation Project
            Hugging Face na @argilla wanatengeneza mradi wa Multilingual Prompt Evaluation Project (https://github.com/huggingface/data-is-better-together/tree/main/prompt_translation). Hii ni kipimo cha lugha nyingi kilichofunguliwa kwa ajili ya kutathmini mifano ya lugha, na hivyo kwa Kiswahili.


            ## Lengo ni kutafsiri Maombi (Prompts) 500
            Na kama ilivyo kawaida: data inahitajika kwa hilo! Jamii ilichagua maombi bora 500 ambayo yametengenza benchmark. Maombi hayo yapo kwa lugha ya Kiingereza.
            **Ndio maana tunahitaji msaada wako**: ikiwa sisi sote tutatafsiri maombi 500, tunaweza kuongeza **Kiswahili** kwenye orodha ya lugha zilizotafsiriwa kwa ufasaha.

            ## Jinsi ya kushiriki (Swahili)
            Kushiriki ni rahisi. Nenda kwenye (https://huggingface.co/spaces/DIBT-Swahili/prompt-translation-for-Swahili), ingia au unda akaunti ya Hugging Face, na unaweza kuanza kufanya kazi.
            Shukrani za mapema! Tumerahisisha ufanyaji kazi kwa kutasfiri kwa kutumia system mbalimbali ambazo hazina uhakika mkubwa wa kuwa na majibu sahihi.


            ## How to participate
            Participating is easy. Go to the (https://huggingface.co/spaces/DIBT-Swahili/prompt-translation-for-Swahili), log in or create a Hugging Face account, and you can start working.
            Thanks in advance! Oh, and here's a little nudge: everything has been translated using various models, however, the translations may not be accurate.
            """
        )

        # "Current progress" section: KPI numbers plus the donut chart.
        gr.Markdown(
            """
            ## \U0001F680 Maendeleo ya Sasa
            Hapa ndiyo tumefikia mpaka sasa!
            """
        )
        with gr.Row():

            kpi_submitted_plot = gr.Plot(label="Plot")
            demo.load(
                kpi_chart_submitted,
                inputs=[],
                outputs=[kpi_submitted_plot],
            )

            kpi_remaining_plot = gr.Plot(label="Plot")
            demo.load(
                kpi_chart_remaining,
                inputs=[],
                outputs=[kpi_remaining_plot],
            )

            donut_total_plot = gr.Plot(label="Plot")
            demo.load(
                donut_chart_total,
                inputs=[],
                outputs=[donut_total_plot],
            )

        # "Hall of Fame" section: number of annotators plus the top contributors.
        gr.Markdown(
            """
            ## \U0001F47E Ukumbi wa Umaarufu
            Hapa unaweza kuona wachangiaji bora na idadi ya maoni waliyotoa.
            """
        )

        with gr.Row():

            kpi_hall_plot = gr.Plot(label="Plot")
            demo.load(
                kpi_chart_total_annotators, inputs=[], outputs=[kpi_hall_plot]
            )

            top_df_plot = gr.Dataframe(
                headers=[NAME, NUMBER_ANNOTATIONS],
                datatype=[
                    "markdown",  # usernames are rendered as profile links
                    "number",
                ],
                row_count=50,
                col_count=(2, "fixed"),
                interactive=False,
            )
            demo.load(get_top, None, [top_df_plot])

    # Launch the Gradio interface
    demo.launch()


if __name__ == "__main__":
    main()