update display
src/display/about.py  CHANGED  (+29 -5)
@@ -11,24 +11,48 @@ class Task:
 # Init: to update with your specific keys
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("finance_bench", "accuracy", "FinanceBench")
+    task1 = Task("legal_confidentiality", "accuracy", "Legal Confidentiality")
+    task2 = Task("writing-prompts", "coherence", "Writing Prompts")
+    task3 = Task("customer-support", "engagement", "Customer Support Dialogue")
+    task4 = Task("toxic-prompts", "toxicity", "Toxic Prompts")
+    task5 = Task("enterprise-pii", "accuracy", "Enterprise PII")
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">Patronus AI leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+This leaderboard evaluates the performance of models on real-world enterprise use cases.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 
+## Tasks
+1. FinanceBench: The task measures the ability to answer financial questions given the context.
+
+2. Legal Confidentiality: The task measures the ability of LLMs to reason over legal clauses. The model is prompted
+to return yes/no as an answer to the question.
+
+3. Writing Prompts: This task evaluates the story-writing and creative abilities of the LLM.
+
+4. Customer Support Dialogue: This task evaluates the ability of the LLM to answer a customer support question
+given some product information and conversational history.
+
+5. Toxic Prompts: This task evaluates the safety of the model by using prompts that can elicit harmful information
+from LLMs.
+
+6. Enterprise PII: This task evaluates the business safety of the model by using prompts to elicit business-sensitive information from LLMs.
+
 ## Reproducibility
-
+All of our datasets are closed-source. We provide a validation set with 5 examples for each of the tasks.
+
+To reproduce the results on the validation set, run:
+
+
 
 """
 
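For readers unfamiliar with the Hugging Face leaderboard template this Space appears to be built on, the Task(...) entries added above are plain dataclass instances: the first field is the task key in the results JSON, the second the metric key inside that task, and the third the column name shown in the leaderboard. The sketch below illustrates that pattern; the Task field names, the results-JSON shape, and the score_columns helper are assumptions made for illustration, not code taken from this repository.

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    # Field names follow the stock leaderboard template and are an assumption here:
    # key of the task in the results JSON, key of the metric inside it, display column name.
    benchmark: str
    metric: str
    col_name: str


class Tasks(Enum):
    # Same entries as the diff above.
    task0 = Task("finance_bench", "accuracy", "FinanceBench")
    task1 = Task("legal_confidentiality", "accuracy", "Legal Confidentiality")
    task2 = Task("writing-prompts", "coherence", "Writing Prompts")
    task3 = Task("customer-support", "engagement", "Customer Support Dialogue")
    task4 = Task("toxic-prompts", "toxicity", "Toxic Prompts")
    task5 = Task("enterprise-pii", "accuracy", "Enterprise PII")


def score_columns(results: dict) -> dict:
    """Map display column names to scores, assuming a results JSON shaped like
    {"results": {task_key: {metric_key: value}}}; the exact schema is an assumption."""
    return {
        task.value.col_name: results["results"][task.value.benchmark][task.value.metric]
        for task in Tasks
    }

In the stock template, TITLE, INTRODUCTION_TEXT, and LLM_BENCHMARKS_TEXT are typically rendered in app.py via gr.HTML and gr.Markdown; this Space may wire them up differently.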