Spaces: Running on CPU Upgrade

ayushi0430 committed • Commit 283ff70
Parent(s): 42030b2

cleanup

Files changed:
- Dockerfile +0 -4
- adhoc.py +19 -42
- main_backend.py +2 -2
- output-data/test.txt +0 -0
- src/about.py +86 -4
- src/backend/lamini_eval/evaluators/ecommerce_evaluator.py +7 -53
- src/backend/lamini_eval/evaluators/utils/ecommerce_utils.py +52 -0
- src/backend/lamini_eval/models/lamini_model.py +39 -26
- src/backend/run_eval_suite.py +4 -4
Dockerfile
CHANGED
@@ -12,12 +12,8 @@ COPY src /code/src
 COPY app.py /code/app.py
 COPY main_backend.py /code/main_backend.py
 
-# to generate adhoc result files --> to be deleted once generation pipeline is stable
 COPY adhoc.py /code/adhoc.py
 
-# answer and score generation pipeline
-COPY generation.py /code/generation.py
-
 
 EXPOSE 7860
 
adhoc.py
CHANGED
@@ -1,54 +1,31 @@
 import asyncio
-import json
-
-from src.backend.harness_evaluator import HarnessEvaluator
-from src.backend.lamini_evaluator import LaminiEvaluator
 from src.backend.manage_requests import EvalRequest
-from src.envs import LIMIT
+from src.envs import LIMIT, EVAL_RESULTS_PATH_BACKEND, RESULTS_REPO, DEVICE
+from src.backend.run_eval_suite import run_evaluation
+from src.about import HarnessTasks
 
 
 async def run_adhoc_eval(eval_request: EvalRequest):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    results = evaluator.evaluate(task_names)
-    print("harness results:", results)
-
-    results_trimmed = {
-        "config": results["config"],
-        "results": {
-            "mmlu_flan_n_shot_generative_global_facts": results["results"]["mmlu_flan_n_shot_generative_global_facts"],
-            "mmlu_flan_n_shot_generative_formal_logic": results["results"]["mmlu_flan_n_shot_generative_formal_logic"],
-            "truthfulqa_gen": results["results"]["truthfulqa_gen"],
-            "response_subjective_score": lamini_results["results"]["response_subjective_score"],
-            "product_id_precision_score": lamini_results["results"]["product_id_precision_score"],
-        }
-    }
-    results_trimmed["config"]["model_dtype"] = eval_request.precision
-    results_trimmed["config"]["model_name"] = eval_request.model
-    results_trimmed["config"]["model_sha"] = eval_request.revision
-
-    output = json.dumps(results_trimmed, indent=4)
-    print("output:", output)
-
-    return output
+    # This job runs lamini tasks and harness tasks
+
+    TASKS_HARNESS = [task.value.benchmark for task in HarnessTasks]
+
+    await run_evaluation(
+        eval_request=eval_request,
+        task_names=TASKS_HARNESS,
+        num_fewshot=0,
+        local_dir=EVAL_RESULTS_PATH_BACKEND,
+        results_repo=RESULTS_REPO,
+        batch_size=1,
+        device=DEVICE,
+        no_cache=True,
+        limit=LIMIT
+    )
 
 
 def main():
     # eval_request: EvalRequest(model='meta-llama/Llama-2-7b-chat-hf', private=False, status='FINISHED', json_filepath='', weight_type='Original', model_type='\ud83d\udfe2 : pretrained', precision='bfloat16', base_model='', revision='main', submitted_time='2023-11-21T18:10:08Z', likes=0, params=0.1, license='custom')
-    vals = {"model": "
+    vals = {"model": "hf-internal-testing/tiny-random-gpt2", "json_filepath": "", "base_model": "", "revision": "main",
             "private": False,
             "precision": "bfloat16", "weight_type": "Original", "status": "PENDING",
             "submitted_time": "2023-11-21T18:10:08Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0,
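The rewritten adhoc.py now delegates everything to run_evaluation. Below is a minimal sketch of how the adhoc job could be driven, assuming EvalRequest accepts the fields shown in the diff; the exact constructor signature and import path are not part of this commit.

```
# Hedged sketch: build an EvalRequest from the vals dict and run the async adhoc job.
import asyncio

from adhoc import run_adhoc_eval                      # assumed import path
from src.backend.manage_requests import EvalRequest

vals = {"model": "hf-internal-testing/tiny-random-gpt2", "json_filepath": "",
        "base_model": "", "revision": "main", "private": False,
        "precision": "bfloat16", "weight_type": "Original", "status": "PENDING",
        "submitted_time": "2023-11-21T18:10:08Z", "likes": 0}

eval_request = EvalRequest(**vals)         # assumes keyword construction works
asyncio.run(run_adhoc_eval(eval_request))  # runs the harness + lamini evaluation once
```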
main_backend.py
CHANGED
@@ -15,10 +15,10 @@ from src.backend.sort_queue import sort_models_by_priority
 
 from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, \
     LIMIT, TOKEN, RUN_MODE
-from src.about import
+from src.about import NUM_FEWSHOT, HarnessTasks
 import asyncio
 
-TASKS_HARNESS = [task.value.benchmark for task in
+TASKS_HARNESS = [task.value.benchmark for task in HarnessTasks]
 
 logging.basicConfig(level=logging.ERROR)
 pp = pprint.PrettyPrinter(width=80)
output-data/test.txt
ADDED
File without changes
src/about.py
CHANGED
@@ -8,6 +8,12 @@ class Task:
     metric: str
     col_name: str
 
+class HarnessTasks(Enum):
+    mmlu_flan_n_shot_generative_global_facts = Task("mmlu_flan_n_shot_generative_global_facts",
+                                                    "exact_match,strict-match", "MMLU (Global facts)")
+    mmlu_flan_n_shot_generative_formal_logic = Task("mmlu_flan_n_shot_generative_formal_logic",
+                                                    "exact_match,strict-match", "MMLU (Formal logic)")
+    truthfulqa_gen = Task("truthfulqa_gen", "bleu_max,none", "Truthful QA")
 
 # Select your tasks here
 # ---------------------------------------------------
@@ -20,6 +26,7 @@ class Tasks(Enum):
     mmlu_flan_n_shot_generative_formal_logic = Task("mmlu_flan_n_shot_generative_formal_logic",
                                                     "exact_match,strict-match", "MMLU (Formal logic)")
     truthfulqa_gen = Task("truthfulqa_gen", "bleu_max,none", "Truthful QA")
+
     response_subjective_score = Task("response_subjective_score", "response_subjective_score",
                                      "Subjective Response Score")
     product_id_precision_score = Task("product_id_precision_score", "product_id_precision_score",
@@ -42,19 +49,94 @@ This leaderboard evaluates and benchmarks LLM hallucinations when answering ques
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 
+All the requests are executed on the Lamini servers. Get your API key from [Lamini](https://app.lamini.ai/) today!
+
+We run the following evaluation metrics:
+
+1. **MMLU (Global facts)**: Evaluates the model's ability to answer questions about global facts.
+2. **MMLU (Formal logic)**: Evaluates the model's ability to answer questions about formal logic.
+3. **Truthful QA**: Evaluates the model's ability to generate truthful answers.
+
+### Custom benchmarks:
+
+We introduce 2 custom benchmarks run on an open-source product catalog dataset.
+
+4. **Subjective Response Score**: Subjectively score and evaluates the model's ability to answer a customer's question about a product.
+5. **Product ID Precision Score**: An exact match of the product id answered by the model in response to a customer's question. This evaluates how well the model learns and its recall abilities.
+
+
+Task:
+Given a product catalog dataset, we evaluate the models on its ability to remember the product information correctly and answer a customer's question based on the product information.
+The expectation here is that the models finetuned over this dataset should be able to generate the correct product information compared to the base model.
+
+Prompt used to generate product information:
+```
+You are an expert shopper at Instacart.
+You are helping a customer find a product.
+Include the product name, id, and detailed description in your answer.
+A product id is a number between 0 and 100,000.
+The customer asks\n
+<question>
+```
+
+
+We use Mistral(mistralai/Mistral-7B-Instruct-v0.1) to score the model answers.
+
+Prompt to get the scoring rubric:
+```
+Read this scoring rubric carefully and follow the instructions precisely:\n
+A score of 5 means that model's id is the same as the gold answer's id.\n
+A score of 4 means that the model's product name is the same or a paraphrase of the gold answer, but the id may be wrong. For example, the product names 'Tuna' and 'Canned Tuna' are similar\n
+A score of 3 means that the model's description is similar as the gold answer's description, but the id and product name may be wrong. For example, lemonade and iced tea are different products.\n
+A score of 2 means that the model's description is not similar to the gold answer, but the answer is plausible.\n
+A score of 1 means that the model's description is not similar to the gold answer, and the answer doesn't make sense.\n
+
+Here are three examples of how to score the model's response:\n
+gold answer == Product ID: 1234, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Tuna, Description: Canned Tuna, score == 5\n
+gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Canned Tuna, Description: Tuna, score == 4\n
+gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Bubble Gum, Description: Delicious treat, score == 1\n
+Assign a 5 even if fields are missing, for example: gold answer == Product ID: 1234, model response == Product ID: 1234, score == 5\n
+```
+
+Prompt used to score the model answers:
+```
+A model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n<</SYS>>\n\n
+Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n
+<rubric prompt>
+
+Use the full range. Read the gold answer carefully.
+Explain your score in 2-3 short sentences not exceeding 100 words each, then assign a score.
+Output your score as a JSON object in the format <"explanation\" : str, \"score\" : int>
+Use single quotes within your explanation. End your explanation with a double quote.\n
+Prefer answers that are most similar to the gold answer, even if the gold answer refused to answer the question.\n\n
+========== question =========\n<question>\n\n
+f"========== gold answer =========\n<expected response>\n\n
+f"========== model answer =========\n<model response>\n\n
+"=" * 40 + "\n\n
+f"How would you score the model's answer compared to the gold answer (using the 1-5 scale defined above)?\n\n
+```
+
+
 ## Reproducibility
-To run this code locally:
 
-
+First change the environment variables in `src/env`.
+
+Now, there are 2 ways to run this code locally:
 
-
+1. To run the backend server + UI locally, run the following command:
 ```
 ./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="<any huggingface model you want to try>"
 
 Eg:
 ./start.sh RUN_MODE="LOCAL" LOCAL_MODEL_NAME="hf-internal-testing/tiny-random-gpt2"
 ```
-This does not upload/download any requests/results files.
+This does not upload/download any requests/results files. It will run the eval metrics for the model specified and save the results to output-data folder.
+
+2. To run only the backend server locally, make changes to adhoc.py to specify your model name and run the following command:
+```
+./run-adhoc.sh"
+```
+This does not upload/download any requests/results files. It will run the eval metrics for the model specified and save the results to the output-data folder.
 
 """
 
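The new LLM_BENCHMARKS_TEXT states that the scorer must return a JSON object with "explanation" and "score", and the two custom metrics are averages over the scored items. A hedged sketch of that post-processing, assuming a hypothetical parse_score helper; the aggregation mirrors evaluate_hallucination later in this commit, and the items are placeholders, not real benchmark data.

```
# Hedged sketch: parse the scorer's JSON output and aggregate the two custom metrics.
import json

def parse_score(raw_response: str) -> int:
    """Hypothetical helper: extract the 1-5 score from the scorer's JSON output."""
    return int(json.loads(raw_response)["score"])

# Placeholder items shaped like the evaluator's results (not real data).
items = [
    {"answer": {"score": 5}, "is_exact_match": True},
    {"answer": {"score": 3}, "is_exact_match": False},
]
response_subjective_score = sum(item["answer"]["score"] for item in items) / len(items)
product_id_precision_score = sum(int(item["is_exact_match"]) for item in items) / len(items)
print(response_subjective_score, product_id_precision_score)  # 4.0 0.5
```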
src/backend/lamini_eval/evaluators/ecommerce_evaluator.py
CHANGED
@@ -7,11 +7,10 @@ import lamini
 from lamini.generation.base_prompt_object import PromptObject
 from lamini.generation.generation_node import GenerationNode
 from lamini.generation.generation_pipeline import GenerationPipeline
-from tqdm import tqdm
-import hashlib
 from src.backend.lamini_eval.models.gpt4_model import load_gpt4_model
 from src.backend.lamini_eval.models.lamini_model import load_lamini_model
 from src.envs import DATASET_PATH, LAMINI_API_KEY
+from src.backend.lamini_eval.evaluators.utils.ecommerce_utils import format_results, save_results
 
 lamini.api_key = LAMINI_API_KEY
 
@@ -46,70 +45,25 @@ class EcommerceEvaluator:
             i += 1
             yield PromptObject(prompt="", data=line)
 
-    async def save_results(self, answers, model_name):
-        path = f"/code/data/{model_name}"
-        os.makedirs(path, exist_ok=True)
-        path = path + "/model-answers.jsonl"
-        short_answers = []
-        with jsonlines.open(path, "w") as writer:
-            pbar = tqdm(desc="Saving answers", unit=" answers")
-            async for answer in answers:
-                answer = {
-                    "prompt": answer.prompt,
-                    "question": answer.data["question"],
-                    "answer": answer.response,
-                    "is_exact_match": answer.data["is_exact_match"]
-                }
-                short_answers.append(answer)
-                print(f"\n\n=======\n{answer}\n\n")
-                writer.write(answer)
-                pbar.update()
-        return short_answers
-
     async def evaluate_hallucination(self):
         dataset = self.load_shopping_dataset(DATASET_PATH)
         answers = AnswerScorePipeline(answer_model=self.model_name).call(dataset)
-        results = await
-        print(f"evaluate_hallucination results: {results}")
+        results = await save_results(answers, model_name=self.model_name)
 
         try:
             mean_response_score = sum([item["answer"]["score"] for item in results]) / len(results)
             product_id_precision_score = sum([int(item["is_exact_match"]) for item in results]) / len(results)
-            results =
-
-
+            results = format_results(model_name=self.model_name,
+                                     response_subjective_score=mean_response_score,
+                                     product_id_precision_score=product_id_precision_score)
         except ZeroDivisionError:
             raise ValueError("No results to evaluate")
         return results
 
-    def compute_hash(self, input):
-        m = hashlib.md5()
-        m.update(str(input).encode("utf-8"))
-        return m.hexdigest()
-
-    def format_results(self, model_name: str, response_subjective_score: float,
-                       product_id_precision_score: float) -> dict:
-        results = {
-            "config": {
-                "model_dtype": "100",  # Precision with which you ran the evaluation
-                "model_name": model_name,  # Name of the model
-                "model_sha": self.compute_hash(model_name)  # Hash of the model
-            },
-            "results": {
-                "response_subjective_score": {
-                    "response_subjective_score": response_subjective_score
-                },
-                "product_id_precision_score": {
-                    "product_id_precision_score": product_id_precision_score
-                }
-            }
-        }
-
-        return results
-
 
 class AnswerScorePipeline(GenerationPipeline):
-    def __init__(self, answer_model="mistralai/Mistral-7B-Instruct-v0.1",
+    def __init__(self, answer_model="mistralai/Mistral-7B-Instruct-v0.1",
+                 score_model="mistralai/Mistral-7B-Instruct-v0.1"):
         super(AnswerScorePipeline, self).__init__()
 
         self.answer_generator = AnswerGenerator(model_name=answer_model)
src/backend/lamini_eval/evaluators/utils/ecommerce_utils.py
ADDED
@@ -0,0 +1,52 @@
+import hashlib
+import os
+import jsonlines
+from tqdm import tqdm
+
+
+def format_results(model_name: str, response_subjective_score: float,
+                   product_id_precision_score: float) -> dict:
+    results = {
+        "config": {
+            "model_dtype": "100",  # Precision with which you ran the evaluation
+            "model_name": model_name,  # Name of the model
+            "model_sha": compute_hash(model_name)  # Hash of the model
+        },
+        "results": {
+            "response_subjective_score": {
+                "response_subjective_score": response_subjective_score
+            },
+            "product_id_precision_score": {
+                "product_id_precision_score": product_id_precision_score
+            }
+        }
+    }
+
+    return results
+
+
+def compute_hash(input):
+    m = hashlib.md5()
+    m.update(str(input).encode("utf-8"))
+    return m.hexdigest()
+
+
+async def save_results(answers, model_name):
+    path = f"/code/data/{model_name}"
+    os.makedirs(path, exist_ok=True)
+    path = path + "/lamini-answers.jsonl"
+    short_answers = []
+    with jsonlines.open(path, "w") as writer:
+        pbar = tqdm(desc="Saving answers", unit=" answers")
+        async for answer in answers:
+            answer = {
+                "prompt": answer.prompt,
+                "question": answer.data["question"],
+                "answer": answer.response,
+                "is_exact_match": answer.data["is_exact_match"]
+            }
+            short_answers.append(answer)
+            print(f"\n\n=======\n{answer}\n\n")
+            writer.write(answer)
+            pbar.update()
+    return short_answers
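The helpers extracted into ecommerce_utils.py are plain functions, so they can be exercised in isolation. A small hedged sketch, using the import path added in ecommerce_evaluator.py; the score values are placeholders.

```
# Hedged sketch: format_results builds the leaderboard payload and hashes the model name.
from src.backend.lamini_eval.evaluators.utils.ecommerce_utils import compute_hash, format_results

payload = format_results(model_name="hf-internal-testing/tiny-random-gpt2",
                         response_subjective_score=4.2,     # placeholder value
                         product_id_precision_score=0.75)   # placeholder value

assert payload["config"]["model_sha"] == compute_hash("hf-internal-testing/tiny-random-gpt2")
print(payload["results"]["product_id_precision_score"])     # {'product_id_precision_score': 0.75}

# save_results is async and consumes the answer stream from AnswerScorePipeline,
# writing /code/data/<model>/lamini-answers.jsonl, so it is not exercised here.
```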
src/backend/lamini_eval/models/lamini_model.py
CHANGED
@@ -1,6 +1,6 @@
 import os
 from typing import List
-
+import jsonlines
 import lamini
 from lamini import MistralRunner
 from lm_eval.api.model import LM
@@ -28,35 +28,48 @@ class LaminiModel(LM):
     def __call__(self, prompt, output_type):
         return self.runner(prompt=prompt, output_type=output_type)
 
+    def get_helm_response(self, request):
+        question = request.arguments[0]
+        if request.task_name == "truthfulqa_gen":
+            obj = TruthfulQA_Evaluator()
+            prompt = obj.get_prompt(question)
+            try:
+                response = self.runner(prompt=prompt)
+            except Exception as e:
+                # select random answer
+                print("Error fetching lamini response: ", e)
+                response = "\nA: none"
+
+        else:
+            obj = MMLU_Evaluator()
+            prompt = obj.get_prompt(question)
+            try:
+                op_type = {"explanation": "str", "answer": "str"}
+                answer = self.runner(prompt=prompt, output_type=op_type)
+                response = f"({answer['answer']})"
+            except Exception as e:
+                # select random answer
+                print("Error fetching lamini response: ", e)
+                # random answer
+                response = "(A)"
+        return response
+
     def generate_until(self, requests) -> List[str]:
         print("inside generate_until")
 
         res = []
-
-
-
-
-
-
-
-
-
-
-
-
-            obj = MMLU_Evaluator()
-            prompt = obj.get_prompt(question)
-            try:
-                op_type = {"explanation": "str", "answer": "str"}
-                answer = self.runner(prompt=prompt, output_type=op_type)
-                response = f"({answer['answer']})"
-            except Exception as e:
-                # select random answer
-                print("Error fetching lamini response: ", e)
-                # random answer
-                response = "(A)"
-            res.append(response)
-            self.cache_hook.add_partial("generate_until", request, response)
+        path = f"/code/data/{self.model}"
+        os.makedirs(path, exist_ok=True)
+        path = path + "/helm-answers.jsonl"
+
+        with jsonlines.open(path, "w") as writer:
+            for request in tqdm(requests):
+                write_dict = request.__dict__
+                response = self.get_helm_response(request)
+                write_dict["model_response"] = response
+                writer.write(write_dict)
+                res.append(response)
+                self.cache_hook.add_partial("generate_until", request, response)
         return res
 
     def loglikelihood_rolling(self, requests):
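generate_until now persists every harness request plus the model's reply to a JSONL file. A hedged sketch for inspecting that file offline; the path layout and the "model_response" field follow the diff, while the presence of a "task_name" key in the serialized request dict is an assumption that depends on the harness version.

```
# Hedged sketch: read back the helm-answers.jsonl written by generate_until.
import jsonlines

# Assumed model directory under the path layout used in the diff.
path = "/code/data/hf-internal-testing/tiny-random-gpt2/helm-answers.jsonl"

with jsonlines.open(path) as reader:
    for record in reader:
        # Each record is the serialized request dict plus the "model_response" field.
        print(record.get("task_name"), "->", record["model_response"])
```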
src/backend/run_eval_suite.py
CHANGED
@@ -10,7 +10,7 @@ from src.backend.harness_evaluator import HarnessEvaluator
 
 logging.getLogger("openai").setLevel(logging.WARNING)
 
-
+# This functon runs lamini tasks and harness tasks
 async def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str,
                          results_repo: str, no_cache=True, limit=None):
     if limit:
@@ -29,12 +29,12 @@ async def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, bat
                                          output_base_path='logs')
     results = harness_evaluator.evaluate(task_names)
 
-    print("results:", results)
     results_trimmed = {
         "config": results["config"],
         "results": {
-            "
-            "
+            "mmlu_flan_n_shot_generative_global_facts": results["results"]["mmlu_flan_n_shot_generative_global_facts"],
+            "mmlu_flan_n_shot_generative_formal_logic": results["results"]["mmlu_flan_n_shot_generative_formal_logic"],
+            "truthfulqa_gen": results["results"]["truthfulqa_gen"],
             "response_subjective_score": lamini_results["results"]["response_subjective_score"],
             "product_id_precision_score": lamini_results["results"]["product_id_precision_score"],
         }