ayushi0430 committed
Commit a4a4c60
1 Parent(s): 924dee3

update readme
adhoc.py CHANGED
@@ -1,6 +1,6 @@
 import asyncio
 from src.backend.manage_requests import EvalRequest
-from src.envs import LIMIT, EVAL_RESULTS_PATH_BACKEND, RESULTS_REPO, DEVICE
+from src.envs import LIMIT, EVAL_RESULTS_PATH_BACKEND, RESULTS_REPO, DEVICE, LOCAL_MODEL_NAME
 from src.backend.run_eval_suite import run_evaluation
 from src.about import HarnessTasks
 
@@ -25,8 +25,7 @@ async def run_adhoc_eval(eval_request: EvalRequest):
 
 def main():
     # eval_request: EvalRequest(model='meta-llama/Llama-2-7b-chat-hf', private=False, status='FINISHED', json_filepath='', weight_type='Original', model_type='\ud83d\udfe2 : pretrained', precision='bfloat16', base_model='', revision='main', submitted_time='2023-11-21T18:10:08Z', likes=0, params=0.1, license='custom')
-    model_name = "2cce1211d4e3aa176d5688ab38b85f46b41be221f856f4171e6e42b55864f3f3"
-    vals = {"model": model_name, "json_filepath": "", "base_model": "", "revision": "main",
+    vals = {"model": LOCAL_MODEL_NAME, "json_filepath": "", "base_model": "", "revision": "main",
            "private": False,
            "precision": "bfloat16", "weight_type": "Original", "status": "PENDING",
            "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0,
run-adhoc.sh CHANGED
@@ -11,5 +11,17 @@ set -Eeuoxa pipefail
 
 LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
+for ARGUMENT in "$@"
+do
+    KEY=$(echo $ARGUMENT | cut -f1 -d=)
+
+    KEY_LENGTH=${#KEY}
+    VALUE="${ARGUMENT:$KEY_LENGTH+1}"
+
+    export "$KEY"="$VALUE"
+done
+
+echo "Model passed is: $LOCAL_MODEL_NAME"
+
 docker buildx build --platform=linux/amd64 -t adhoc .
-docker run -it --rm --platform=linux/amd64 -v $LOCAL_DIRECTORY/output-data:/code/data adhoc python adhoc.py
+docker run -it --rm --platform=linux/amd64 -v $LOCAL_DIRECTORY/output-data:/code/data -e LOCAL_MODEL_NAME=$LOCAL_MODEL_NAME adhoc python adhoc.py
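The new loop splits each `KEY=VALUE` command-line argument on the first `=` and exports it as an environment variable, which `docker run -e LOCAL_MODEL_NAME=...` then forwards into the container. A rough Python equivalent of that parsing, shown only to illustrate the semantics (it is not part of the repo):

```python
import os
import sys

# Mirror of the shell loop: split each "KEY=VALUE" argument on the first "="
# and export it so a later step (here, docker run -e ...) can read it.
for argument in sys.argv[1:]:
    key, _, value = argument.partition("=")
    os.environ[key] = value

print(f"Model passed is: {os.environ.get('LOCAL_MODEL_NAME', '')}")
```

For example, `./run-adhoc.sh LOCAL_MODEL_NAME="hf-internal-testing/tiny-random-gpt2"` exports `LOCAL_MODEL_NAME` before the container starts.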
src/about.py CHANGED
@@ -59,27 +59,40 @@ This leaderboard evaluates and benchmarks LLM hallucinations when answering ques
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
+## About the leaderboard
+
+This leaderboard ranks different models (both pre-trained and finetuned) on their **Photographic Memory** as measured on the **Lamini Evaluation Suite**.
+**Photographic Memory** refers to a model's ability to recall and generate accurate information learnt during pre-training or finetuning.
+The benchmarks are divided into two categories: standard benchmarks and custom benchmarks. The standard benchmarks evaluate the model's generation abilities on standard data, while the custom benchmarks evaluate the model's performance on domain-specific data.
+
 ## How it works
 
-All the requests are executed on the Lamini servers. Get your API key from [Lamini](https://app.lamini.ai/) today!
+All evaluation requests for open-source models are executed on the Lamini servers. Get your API key from [Lamini](https://app.lamini.ai/) today!
+
+### Standard benchmarks:
 
-We run the following evaluation metrics:
+We run the following standard evaluation benchmarks to assess any regression from the LLM pretraining phase:
 
-1. **MMLU (Global facts)**: Evaluates the model's ability to answer questions about global facts.
-2. **MMLU (Formal logic)**: Evaluates the model's ability to answer questions about formal logic.
-3. **Truthful QA**: Evaluates the model's ability to generate truthful answers.
+**1. MMLU (Global facts)**: Evaluates the model's ability to answer questions about global facts.
+
+**2. Truthful QA**: Evaluates the model's ability to generate truthful answers.
 
 ### Custom benchmarks:
 
-We introduce a few custom benchmarks run on an open-source product catalog dataset.
+We also introduce a few custom benchmarks on domain-specific data that assess precision and recall for numbers and exact figures. The same approach can be adapted to other domains.
+
+### <ins>Domain 1: E-commerce</ins>
+Given a product catalog dataset, we evaluate the model on its ability to remember the product information correctly and answer a customer's question.
+The expectation is that models finetuned on this dataset should generate the correct product information, unlike the base model. We assign the following scores to the model output:
+
+**3. Product Response Subjective Score**: Subjectively evaluates the model's ability to answer a customer's question. The answer is judged on its proximity to the truth and its relevance.
 
-4. **Subjective Response Score**: Subjectively score and evaluates the model's ability to answer a customer's question about a product.
-5. **Product ID Precision Score**: An exact match of the product id answered by the model in response to a customer's question. This evaluates how well the model learns and its recall abilities.
+**4. Product ID Precision Score**: An exact match of the product ID answered by the model in response to a customer enquiring about a product. This evaluates how well the model learns and can recall the exact figures seen during training.
 
+The evaluation dataset for the e-commerce domain is available [here](https://github.com/lamini-ai/leaderboard/blob/main/src/datasets/shopping.jsonl).
+
+The data for finetuning models was created using the [Lamini generation pipeline](https://github.com/lamini-ai/instacart-greg/blob/main/instacart/generate_data_pipeline.py) and is available [here](https://huggingface.co/datasets/lamini/product-catalog-questions).
 
-Task:
-Given a product catalog dataset, we evaluate the models on its ability to remember the product information correctly and answer a customer's question based on the product information.
-The expectation here is that the models finetuned over this dataset should be able to generate the correct product information compared to the base model.
 
 Prompt used to generate product information:
 ```
@@ -91,10 +104,40 @@ The customer asks\n
 <question>
 ```
 
+### <ins>Domain 2: Medicine</ins>
+
+Here we mimic a common problem in the medical domain: accurately answering questions about the ICD-11 standard. ICD-11 (International Classification of Diseases, 11th Revision) is a globally used standard for classifying diseases, health conditions, and related phenomena.
+It is maintained by the World Health Organization (WHO) and serves as a common language for health information systems, epidemiology, health statistics, clinical care, and research.
+
+**5. ICD Code Precision Score**: When a model answers a question about the ICD-11 standard, this metric evaluates the accuracy of the generated ICD-11 code. The standard is stringent, so producing the exact code matters.
+
+**6. ICD Code Subjective Score**: This metric evaluates the overall quality of the answer in terms of correctness and completeness, and assigns a score accordingly.
+
+
+The evaluation dataset for the medical domain is available [here](https://github.com/lamini-ai/leaderboard/blob/main/src/datasets/icd11.jsonl).
+
+The data for finetuning models was created using the [Lamini generation pipeline](https://github.com/lamini-ai/lamini-sdk/blob/greg.cpt-gpt/04_IFT/generate_data_pipeline.py) and is available [here](https://huggingface.co/datasets/lamini/icd-11-qa).
+
+
+### <ins>Domain 3: Finance</ins>
+
+Here we simulate a problem in the finance field: accurately answering questions about companies' financial performance based on the transcripts of their earnings calls.
+
+**7. Earnings Value Precision Score**: While answering questions about a company's financials, the model may output values such as $800M. This score evaluates the accuracy of both the value and the units against the call transcripts, i.e. the training data.
+
+**8. Earnings Value Subjective Score**: This metric assigns a score to the overall quality of the answer in terms of correctness and coherence.
+
+
+The evaluation dataset for the finance domain is available [here](https://github.com/lamini-ai/leaderboard/blob/main/src/datasets/earnings_calls.jsonl).
+
+The data for finetuning models was created using the [Lamini generation pipeline](https://github.com/lamini-ai/lamini-earnings-calls) and is available [here](https://huggingface.co/datasets/lamini/earnings-calls-qa).
+
+
+### Scoring
 
 We use Mistral(mistralai/Mistral-7B-Instruct-v0.2) to score the model answers.
 
-Prompt to get the scoring rubric:
+Sample scoring rubric for the e-commerce domain (the other domains use similar rubrics):
 ```
 Read this scoring rubric carefully and follow the instructions precisely:\n
 A score of 5 means that model's id is the same as the gold answer's id.\n
@@ -110,7 +153,7 @@ gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, m
 Assign a 5 even if fields are missing, for example: gold answer == Product ID: 1234, model response == Product ID: 1234, score == 5\n
 ```
 
-Prompt used to score the model answers:
+Sample prompt used to score the model answers (the other domains use similar prompts):
 ```
 A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n<</SYS>>\n\n
 Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n
@@ -132,25 +175,28 @@ f"How would you score the model's answer compared to the gold answer (using the
 ## Reproducibility
 1. Clone this repo by following the steps provided [here.](https://huggingface.co/spaces/lamini/leaderboard?clone=true)
 
-2. Now change the environment variables in `src/env`.
+2. Now change the environment variables in `src/envs.py` to add your API keys.
 
 There are 2 ways to run this code locally:
 
 1. To run the backend server + UI locally, run the following command:
 ```
-./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="<any huggingface model you want to try>"
+./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="<any huggingface or openai model you want to try>"
 
 Eg:
 ./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="hf-internal-testing/tiny-random-gpt2"
 ```
 
-2. To run only the backend server locally, make changes to adhoc.py to specify your model name and run the following command:
+2. To run only the backend server locally and quickly get evaluation results, run the following command:
 ```
-./run-adhoc.sh
+./run-adhoc.sh LOCAL_MODEL_NAME="<any huggingface or openai model you want to try>"
 ```
 
+Your benchmarking results will still be saved in the `output-data` folder.
+
 """
 
+# We don't allow model submission right now.
 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model
 
@@ -185,13 +231,13 @@ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 @dataset{Lamini,
   author = {Ayushi Sharma},
-  title = {Lamini Anti-hallucination Leaderboard},
+  title = {Lamini Photographic Memory Leaderboard},
  year = {2024},
  month = {03},
  publisher = {Lamini},
  doi = {},
  url = {},
-  abstract = {This leaderboard evaluates and benchmarks LLM hallucinations when answering questions. Models may be pre-trained or finetuned.},
-  keywords = {llm, hallucination},
-  license = {Apache-2.0},
+  abstract = {},
+  keywords = {llm, hallucination, photographic memory},
+  license = {cc-by-4.0},
 }"""
src/backend/custom_evaluator.py CHANGED
@@ -45,7 +45,7 @@ class CustomEvaluator:
             dict: A dictionary containing evaluation results.
         """
         try:
-            if "gpt" in self.model:
+            if "gpt4" in self.model:
                 model_type = "gpt4"
             else:
                 model_type = "lamini"
src/backend/run_eval_suite.py CHANGED
@@ -19,6 +19,7 @@ async def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, bat
             "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
         )
 
+    print(f"\nRunning eval_request: {eval_request}\n")
     custom_evaluator = CustomEvaluator(eval_request.model, eval_request.revision, eval_request.precision,
                                        batch_size, device, no_cache, limit, write_out=True,
                                        output_base_path='logs')
src/envs.py CHANGED
@@ -6,16 +6,16 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("TOKEN")
 RUN_MODE = os.environ.get("RUN_MODE")
-LOCAL_MODEL_NAME = os.environ.get("LOCAL_MODEL_NAME")
+LOCAL_MODEL_NAME = os.environ.get("LOCAL_MODEL_NAME", "hf-internal-testing/tiny-random-gpt2")
 
 LAMINI_ENV = os.environ.get("LAMINI_ENV", "PRODUCTION")
 LAMINI_API_KEY = os.environ.get("LAMINI_API_KEY")
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
-MAX_EXAMPLES = os.environ.get("MAX_EXAMPLES", 10)
+MAX_EXAMPLES = os.environ.get("MAX_EXAMPLES", 2)
+LIMIT = os.environ.get("MAX_EXAMPLES", 2)
 
 OWNER = "lamini" # Change to your org - don't forget to create a results and request file
 DEVICE = "cpu" # "cuda:0" if you add compute
-LIMIT = os.environ.get("MAX_EXAMPLES")
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"