update display
src/display/about.py  CHANGED  (+29 -5)
@@ -11,24 +11,48 @@ class Task:
 # Init: to update with your specific keys
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("finance_bench", "accuracy", "FinanceBench")
+    task1 = Task("legal_confidentiality", "accuracy", "Legal Confidentiality")
+    task2 = Task("writing-prompts", "coherence", "Writing Prompts")
+    task3 = Task("customer-support", "engagement", "Customer Support Dialogue")
+    task4 = Task("toxic-prompts", "toxicity", "Toxic Prompts")
+    task5 = Task("enterprise-pii", "accuracy", "Enterprise PII")
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">Patronus AI leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+This leaderboard evaluates the performance of models on real-world enterprise use cases.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 
+## Tasks
+1. FinanceBench: The task measures the ability to answer financial questions given the context.
+
+2. Legal Confidentiality: The task measures the ability of LLMs to reason over legal clauses. The model is prompted
+to return yes/no as an answer to the question.
+
+3. Writing Prompts: This task evaluates the story-writing and creative abilities of the LLM.
+
+4. Customer Support Dialogue: This task evaluates the ability of the LLM to answer a customer support question
+given some product information and conversational history.
+
+5. Toxic Prompts: This task evaluates the safety of the model by using prompts that can elicit harmful information
+from LLMs.
+
+6. Enterprise PII: This task evaluates the business safety of the model by using prompts to elicit business-sensitive information from LLMs.
+
 ## Reproducibility
-
+All of our datasets are closed-source. We provide a validation set with 5 examples for each of the tasks.
+
+To reproduce the results on the validation set, run:
+
+
 
 """
 
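For readers unfamiliar with the Hugging Face leaderboard template this Space appears to be built on, the Task(...) entries added above are plain dataclass instances: the first field is the task key in the results JSON, the second the metric key inside that task, and the third the column name shown in the leaderboard. The sketch below illustrates that pattern; the Task field names, the results-JSON shape, and the score_columns helper are assumptions made for illustration, not code taken from this repository.

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    # Field names follow the stock leaderboard template and are an assumption here:
    # key of the task in the results JSON, key of the metric inside it, display column name.
    benchmark: str
    metric: str
    col_name: str


class Tasks(Enum):
    # Same entries as the diff above.
    task0 = Task("finance_bench", "accuracy", "FinanceBench")
    task1 = Task("legal_confidentiality", "accuracy", "Legal Confidentiality")
    task2 = Task("writing-prompts", "coherence", "Writing Prompts")
    task3 = Task("customer-support", "engagement", "Customer Support Dialogue")
    task4 = Task("toxic-prompts", "toxicity", "Toxic Prompts")
    task5 = Task("enterprise-pii", "accuracy", "Enterprise PII")


def score_columns(results: dict) -> dict:
    """Map display column names to scores, assuming a results JSON shaped like
    {"results": {task_key: {metric_key: value}}}; the exact schema is an assumption."""
    return {
        task.value.col_name: results["results"][task.value.benchmark][task.value.metric]
        for task in Tasks
    }

In the stock template, TITLE, INTRODUCTION_TEXT, and LLM_BENCHMARKS_TEXT are typically rendered in app.py via gr.HTML and gr.Markdown; this Space may wire them up differently.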