Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
ayushi0430
committed on
Commit
•
a4a4c60
1
Parent(s):
924dee3
update readme
Browse files- adhoc.py +2 -3
- run-adhoc.sh +13 -1
- src/about.py +67 -21
- src/backend/custom_evaluator.py +1 -1
- src/backend/run_eval_suite.py +1 -0
- src/envs.py +3 -3
adhoc.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import asyncio
|
2 |
from src.backend.manage_requests import EvalRequest
|
3 |
-
from src.envs import LIMIT, EVAL_RESULTS_PATH_BACKEND, RESULTS_REPO, DEVICE
|
4 |
from src.backend.run_eval_suite import run_evaluation
|
5 |
from src.about import HarnessTasks
|
6 |
|
@@ -25,8 +25,7 @@ async def run_adhoc_eval(eval_request: EvalRequest):
|
|
25 |
|
26 |
def main():
|
27 |
# eval_request: EvalRequest(model='meta-llama/Llama-2-7b-chat-hf', private=False, status='FINISHED', json_filepath='', weight_type='Original', model_type='\ud83d\udfe2 : pretrained', precision='bfloat16', base_model='', revision='main', submitted_time='2023-11-21T18:10:08Z', likes=0, params=0.1, license='custom')
|
28 |
-
|
29 |
-
vals = {"model": model_name, "json_filepath": "", "base_model": "", "revision": "main",
|
30 |
"private": False,
|
31 |
"precision": "bfloat16", "weight_type": "Original", "status": "PENDING",
|
32 |
"submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0,
|
|
|
1 |
import asyncio
|
2 |
from src.backend.manage_requests import EvalRequest
|
3 |
+
from src.envs import LIMIT, EVAL_RESULTS_PATH_BACKEND, RESULTS_REPO, DEVICE, LOCAL_MODEL_NAME
|
4 |
from src.backend.run_eval_suite import run_evaluation
|
5 |
from src.about import HarnessTasks
|
6 |
|
|
|
25 |
|
26 |
def main():
|
27 |
# eval_request: EvalRequest(model='meta-llama/Llama-2-7b-chat-hf', private=False, status='FINISHED', json_filepath='', weight_type='Original', model_type='\ud83d\udfe2 : pretrained', precision='bfloat16', base_model='', revision='main', submitted_time='2023-11-21T18:10:08Z', likes=0, params=0.1, license='custom')
|
28 |
+
vals = {"model": LOCAL_MODEL_NAME, "json_filepath": "", "base_model": "", "revision": "main",
|
|
|
29 |
"private": False,
|
30 |
"precision": "bfloat16", "weight_type": "Original", "status": "PENDING",
|
31 |
"submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0,
|
run-adhoc.sh
CHANGED
@@ -11,5 +11,17 @@ set -Eeuoxa pipefail
|
|
11 |
|
12 |
LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
docker buildx build --platform=linux/amd64 -t adhoc .
|
15 |
-
docker run -it --rm --platform=linux/amd64 -v $LOCAL_DIRECTORY/output-data:/code/data adhoc python adhoc.py
|
|
|
11 |
|
12 |
LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
13 |
|
14 |
+
for ARGUMENT in "$@"
|
15 |
+
do
|
16 |
+
KEY=$(echo $ARGUMENT | cut -f1 -d=)
|
17 |
+
|
18 |
+
KEY_LENGTH=${#KEY}
|
19 |
+
VALUE="${ARGUMENT:$KEY_LENGTH+1}"
|
20 |
+
|
21 |
+
export "$KEY"="$VALUE"
|
22 |
+
done
|
23 |
+
|
24 |
+
echo "Model passed is: $LOCAL_MODEL_NAME"
|
25 |
+
|
26 |
docker buildx build --platform=linux/amd64 -t adhoc .
|
27 |
+
docker run -it --rm --platform=linux/amd64 -v $LOCAL_DIRECTORY/output-data:/code/data -e LOCAL_MODEL_NAME=$LOCAL_MODEL_NAME adhoc python adhoc.py
|
src/about.py
CHANGED
@@ -59,27 +59,40 @@ This leaderboard evaluates and benchmarks LLM hallucinations when answering ques
|
|
59 |
|
60 |
# Which evaluations are you running? how can people reproduce what you have?
|
61 |
LLM_BENCHMARKS_TEXT = f"""
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
## How it works
|
63 |
|
64 |
-
All the requests are executed on the Lamini servers. Get your API key from [Lamini](https://app.lamini.ai/) today!
|
|
|
|
|
65 |
|
66 |
-
We run the following evaluation
|
67 |
|
68 |
-
1.
|
69 |
-
|
70 |
-
|
71 |
|
72 |
### Custom benchmarks:
|
73 |
|
74 |
-
We introduce a few custom benchmarks
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
-
4.
|
77 |
-
5. **Product ID Precision Score**: An exact match of the product id answered by the model in response to a customer's question. This evaluates how well the model learns and its recall abilities.
|
78 |
|
|
|
|
|
|
|
79 |
|
80 |
-
Task:
|
81 |
-
Given a product catalog dataset, we evaluate the models on its ability to remember the product information correctly and answer a customer's question based on the product information.
|
82 |
-
The expectation here is that the models finetuned over this dataset should be able to generate the correct product information compared to the base model.
|
83 |
|
84 |
Prompt used to generate product information:
|
85 |
```
|
@@ -91,10 +104,40 @@ The customer asks\n
|
|
91 |
<question>
|
92 |
```
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
We use Mistral(mistralai/Mistral-7B-Instruct-v0.2) to score the model answers.
|
96 |
|
97 |
-
|
98 |
```
|
99 |
Read this scoring rubric carefully and follow the instructions precisely:\n
|
100 |
A score of 5 means that model's id is the same as the gold answer's id.\n
|
@@ -110,7 +153,7 @@ gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, m
|
|
110 |
Assign a 5 even if fields are missing, for example: gold answer == Product ID: 1234, model response == Product ID: 1234, score == 5\n
|
111 |
```
|
112 |
|
113 |
-
|
114 |
```
|
115 |
A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n<</SYS>>\n\n
|
116 |
Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n
|
@@ -132,25 +175,28 @@ f"How would you score the model's answer compared to the gold answer (using the
|
|
132 |
## Reproducibility
|
133 |
1. Clone this repo by following the steps provided [here.](https://huggingface.co/spaces/lamini/leaderboard?clone=true)
|
134 |
|
135 |
-
2. Now change the environment variables in `src/env
|
136 |
|
137 |
There are 2 ways to run this code locally:
|
138 |
|
139 |
1. To run the backend server + UI locally, run the following command:
|
140 |
```
|
141 |
-
./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="<any huggingface model you want to try>"
|
142 |
|
143 |
Eg:
|
144 |
./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="hf-internal-testing/tiny-random-gpt2"
|
145 |
```
|
146 |
|
147 |
-
2. To run only the backend server locally
|
148 |
```
|
149 |
-
./run-adhoc.sh
|
150 |
```
|
151 |
|
|
|
|
|
152 |
"""
|
153 |
|
|
|
154 |
EVALUATION_QUEUE_TEXT = """
|
155 |
## Some good practices before submitting a model
|
156 |
|
@@ -185,13 +231,13 @@ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
|
185 |
CITATION_BUTTON_TEXT = r"""
|
186 |
@dataset{Lamini,
|
187 |
author = {Ayushi Sharma},
|
188 |
-
title = {Lamini
|
189 |
year = {2024},
|
190 |
month = {03},
|
191 |
publisher = {Lamini},
|
192 |
doi = {},
|
193 |
url = {},
|
194 |
-
abstract = {
|
195 |
-
keywords = {llm, hallucination},
|
196 |
-
license = {
|
197 |
}"""
|
|
|
59 |
|
60 |
# Which evaluations are you running? how can people reproduce what you have?
|
61 |
LLM_BENCHMARKS_TEXT = f"""
|
62 |
+
## About the leaderboard
|
63 |
+
|
64 |
+
This leaderboard ranks different models (both pre-trained and finetuned) on their **Photographic Memory** as measured on the **Lamini Evaluation Suite**.
|
65 |
+
**Photographic Memory** means the ability of a model to recall and generate accurate information learnt during pre-training or finetuning.
|
66 |
+
The benchmarks are divided into two categories: standard benchmarks and custom benchmarks. The standard benchmarks evaluate the model's generation abilities on standard data, while the custom benchmarks evaluate the model's performance on domain-specific data.
|
67 |
+
|
68 |
## How it works
|
69 |
|
70 |
+
All the open-source models evaluation requests are executed on the Lamini servers. Get your API key from [Lamini](https://app.lamini.ai/) today!
|
71 |
+
|
72 |
+
### Standard benchmarks:
|
73 |
|
74 |
+
We run the following standard evaluation benchmarks to assess any regression from the LLM pretraining phase:
|
75 |
|
76 |
+
**1. MMLU (Global facts)**: Evaluates the model's ability to answer questions about global facts.
|
77 |
+
|
78 |
+
**2. Truthful QA**: Evaluates the model's ability to generate truthful answers.
|
79 |
|
80 |
### Custom benchmarks:
|
81 |
|
82 |
+
We also introduce a few custom benchmarks for domain specific data to assess precision and recall for numbers and exact figures. The same can be adapted to other domains.
|
83 |
+
|
84 |
+
### <ins>Domain 1: E-commerce</ins>
|
85 |
+
Given a product catalog dataset, we evaluate the model on its ability to remember the product information correctly and answer a customer's question.
|
86 |
+
The expectation here is that the models finetuned over this dataset should be able to generate the correct product information compared to the base model. We assign the following scores to the model output:
|
87 |
+
|
88 |
+
**3. Product Response Subjective Score**: Subjectively scores and evaluates the model's ability to answer a customer's question. The answer is judged on proximity to truth and relevance.
|
89 |
|
90 |
+
**4. Product ID Precision Score**: An exact match of the product id answered by the model in response to a customer enquiring about a product. This evaluates how well the model learns and can recall the exact figures learnt during training.
|
|
|
91 |
|
92 |
+
The evaluation dataset for e-commerce domain is available [here](https://github.com/lamini-ai/leaderboard/blob/main/src/datasets/shopping.jsonl).
|
93 |
+
|
94 |
+
The data for finetuning models was created using the [Lamini generation pipeline](https://github.com/lamini-ai/instacart-greg/blob/main/instacart/generate_data_pipeline.py) and is available [here](https://huggingface.co/datasets/lamini/product-catalog-questions).
|
95 |
|
|
|
|
|
|
|
96 |
|
97 |
Prompt used to generate product information:
|
98 |
```
|
|
|
104 |
<question>
|
105 |
```
|
106 |
|
107 |
+
### <ins>Domain 2: Medicine</ins>
|
108 |
+
|
109 |
+
Here we mimic a common problem in the medical domain - accurately answering questions about the ICD-11 standard. ICD-11 (International Classification of Diseases 11th Revision) is a globally used standard for classifying diseases, health conditions, and related phenomena.
|
110 |
+
It is maintained by the World Health Organization (WHO) and serves as a common language for health information systems, epidemiology, health statistics, clinical care, and research.
|
111 |
+
|
112 |
+
**5. ICD Code Precision Score**: When a model answers a question about the ICD-11 standard, this metric evaluates the accuracy of the ICD-11 code generated. The standard is pretty stringent, so it is important to get the right code.
|
113 |
+
|
114 |
+
**6. ICD Code Subjective Score**: This metric evaluates the overall quality of the answers in terms of correctness and completeness, and assigns a score to it.
|
115 |
+
|
116 |
+
|
117 |
+
The evaluation dataset for medical domain is available [here](https://github.com/lamini-ai/leaderboard/blob/main/src/datasets/icd11.jsonl).
|
118 |
+
|
119 |
+
The data for finetuning models was created using the [Lamini generation pipeline](https://github.com/lamini-ai/lamini-sdk/blob/greg.cpt-gpt/04_IFT/generate_data_pipeline.py) and is available [here](https://huggingface.co/datasets/lamini/icd-11-qa).
|
120 |
+
|
121 |
+
|
122 |
+
### <ins>Domain 3: Finance</ins>
|
123 |
+
|
124 |
+
Here we simulate a problem in the finance field - accurately answering questions about companies' financial performance based on the transcripts of their earnings calls.
|
125 |
+
|
126 |
+
**7. Earnings Value Precision Score**: While answering questions about companies' financials, the model might output values like $800M. This score evaluates the accuracy of the value and the units generated based on the transcripts of the calls, i.e., the training data.
|
127 |
+
|
128 |
+
**8. Earnings Value Subjective Score**: This metric assigns a score to the overall quality of the answers in terms of correctness and coherence.
|
129 |
+
|
130 |
+
|
131 |
+
The evaluation dataset for finance domain is available [here](https://github.com/lamini-ai/leaderboard/blob/main/src/datasets/earnings_calls.jsonl).
|
132 |
+
|
133 |
+
The data for finetuning models was created using the [Lamini generation pipeline](https://github.com/lamini-ai/lamini-earnings-calls) and is available [here](https://huggingface.co/datasets/lamini/earnings-calls-qa).
|
134 |
+
|
135 |
+
|
136 |
+
### Scoring
|
137 |
|
138 |
We use Mistral (mistralai/Mistral-7B-Instruct-v0.2) to score the model answers.
|
139 |
|
140 |
+
Sample scoring rubric for the e-commerce domain. Rest of the domain rubrics are similar to this:
|
141 |
```
|
142 |
Read this scoring rubric carefully and follow the instructions precisely:\n
|
143 |
A score of 5 means that model's id is the same as the gold answer's id.\n
|
|
|
153 |
Assign a 5 even if fields are missing, for example: gold answer == Product ID: 1234, model response == Product ID: 1234, score == 5\n
|
154 |
```
|
155 |
|
156 |
+
Sample prompt used to score the model answers. Rest of the domain prompts are similar to this:
|
157 |
```
|
158 |
A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n<</SYS>>\n\n
|
159 |
Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n
|
|
|
175 |
## Reproducibility
|
176 |
1. Clone this repo by following the steps provided [here.](https://huggingface.co/spaces/lamini/leaderboard?clone=true)
|
177 |
|
178 |
+
2. Now change the environment variables in `src/env` to add your api keys.
|
179 |
|
180 |
There are 2 ways to run this code locally:
|
181 |
|
182 |
1. To run the backend server + UI locally, run the following command:
|
183 |
```
|
184 |
+
./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="<any huggingface or openai model you want to try>"
|
185 |
|
186 |
Eg:
|
187 |
./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="hf-internal-testing/tiny-random-gpt2"
|
188 |
```
|
189 |
|
190 |
+
2. To run only the backend server locally to quickly get evaluation results, run the following command:
|
191 |
```
|
192 |
+
./run-adhoc.sh LOCAL_MODEL_NAME="<any huggingface or openai model you want to try>"
|
193 |
```
|
194 |
|
195 |
+
This will still allow you to save your benchmarking results in the folder `output-data`.
|
196 |
+
|
197 |
"""
|
198 |
|
199 |
+
# We don't allow model submission right now.
|
200 |
EVALUATION_QUEUE_TEXT = """
|
201 |
## Some good practices before submitting a model
|
202 |
|
|
|
231 |
CITATION_BUTTON_TEXT = r"""
|
232 |
@dataset{Lamini,
|
233 |
author = {Ayushi Sharma},
|
234 |
+
title = {Lamini Photographic Memory Leaderboard},
|
235 |
year = {2024},
|
236 |
month = {03},
|
237 |
publisher = {Lamini},
|
238 |
doi = {},
|
239 |
url = {},
|
240 |
+
abstract = {},
|
241 |
+
keywords = {llm, hallucination, photographic memory},
|
242 |
+
license = {cc-by-4.0},
|
243 |
}"""
|
src/backend/custom_evaluator.py
CHANGED
@@ -45,7 +45,7 @@ class CustomEvaluator:
|
|
45 |
dict: A dictionary containing evaluation results.
|
46 |
"""
|
47 |
try:
|
48 |
-
if "
|
49 |
model_type = "gpt4"
|
50 |
else:
|
51 |
model_type = "lamini"
|
|
|
45 |
dict: A dictionary containing evaluation results.
|
46 |
"""
|
47 |
try:
|
48 |
+
if "gpt4" in self.model:
|
49 |
model_type = "gpt4"
|
50 |
else:
|
51 |
model_type = "lamini"
|
src/backend/run_eval_suite.py
CHANGED
@@ -19,6 +19,7 @@ async def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, bat
|
|
19 |
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
|
20 |
)
|
21 |
|
|
|
22 |
custom_evaluator = CustomEvaluator(eval_request.model, eval_request.revision, eval_request.precision,
|
23 |
batch_size, device, no_cache, limit, write_out=True,
|
24 |
output_base_path='logs')
|
|
|
19 |
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
|
20 |
)
|
21 |
|
22 |
+
print(f"\nRunning eval_request: {eval_request}\n")
|
23 |
custom_evaluator = CustomEvaluator(eval_request.model, eval_request.revision, eval_request.precision,
|
24 |
batch_size, device, no_cache, limit, write_out=True,
|
25 |
output_base_path='logs')
|
src/envs.py
CHANGED
@@ -6,16 +6,16 @@ from huggingface_hub import HfApi
|
|
6 |
# ----------------------------------
|
7 |
TOKEN = os.environ.get("TOKEN")
|
8 |
RUN_MODE = os.environ.get("RUN_MODE")
|
9 |
-
LOCAL_MODEL_NAME = os.environ.get("LOCAL_MODEL_NAME")
|
10 |
|
11 |
LAMINI_ENV = os.environ.get("LAMINI_ENV", "PRODUCTION")
|
12 |
LAMINI_API_KEY = os.environ.get("LAMINI_API_KEY")
|
13 |
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
|
14 |
-
MAX_EXAMPLES = os.environ.get("MAX_EXAMPLES",
|
|
|
15 |
|
16 |
OWNER = "lamini" # Change to your org - don't forget to create a results and request file
|
17 |
DEVICE = "cpu" # "cuda:0" if you add compute
|
18 |
-
LIMIT = os.environ.get("MAX_EXAMPLES")
|
19 |
# ----------------------------------
|
20 |
|
21 |
REPO_ID = f"{OWNER}/leaderboard"
|
|
|
6 |
# ----------------------------------
|
7 |
TOKEN = os.environ.get("TOKEN")
|
8 |
RUN_MODE = os.environ.get("RUN_MODE")
|
9 |
+
LOCAL_MODEL_NAME = os.environ.get("LOCAL_MODEL_NAME", "hf-internal-testing/tiny-random-gpt2")
|
10 |
|
11 |
LAMINI_ENV = os.environ.get("LAMINI_ENV", "PRODUCTION")
|
12 |
LAMINI_API_KEY = os.environ.get("LAMINI_API_KEY")
|
13 |
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
|
14 |
+
MAX_EXAMPLES = os.environ.get("MAX_EXAMPLES", 2)
|
15 |
+
LIMIT = os.environ.get("MAX_EXAMPLES", 2)
|
16 |
|
17 |
OWNER = "lamini" # Change to your org - don't forget to create a results and request file
|
18 |
DEVICE = "cpu" # "cuda:0" if you add compute
|
|
|
19 |
# ----------------------------------
|
20 |
|
21 |
REPO_ID = f"{OWNER}/leaderboard"
|