ayushi0430 committed • 6c9df03 • 1 Parent(s): efe0c89

add sys tags, pass model

Files changed:
- generation.py +0 -1
- main_backend.py +5 -3
- src/backend/lamini_eval/evaluators/ecommerce_evaluator.py +36 -31
- src/leaderboard/read_evals.py +1 -1
- start.sh +3 -1
generation.py
CHANGED
@@ -2,7 +2,6 @@ from lamini.generation.generation_node import GenerationNode
 from lamini.generation.generation_pipeline import GenerationPipeline
 from lamini.generation.base_prompt_object import PromptObject
 from typing import Union, Iterator, AsyncIterator
-from src.backend.lamini_eval.datasets.ecommerce.shopping_dataset import load_shopping_dataset
 from src.envs import DATASET_PATH, LAMINI_API_KEY
 import logging
 import asyncio

main_backend.py
CHANGED
@@ -28,11 +28,13 @@ RUNNING_STATUS = "RUNNING"
 FINISHED_STATUS = "FINISHED"
 FAILED_STATUS = "FAILED"
 
-
 # TODO: uncomment
 if RUN_MODE != "LOCAL":
-    snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset",
-
+    snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset",
+                      max_workers=60, token=TOKEN)
+    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset",
+                      max_workers=60, token=TOKEN)
+
 
 def run_auto_eval():
     current_pending_status = [PENDING_STATUS]

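Note: snapshot_download here is the standard huggingface_hub call that mirrors a Hub repo into a local directory. A minimal self-contained sketch of the two calls added above; the repo IDs, paths, and token below are placeholders standing in for RESULTS_REPO, QUEUE_REPO, the backend paths, and TOKEN:

    # Sketch only: placeholder values stand in for the Space's env config.
    from huggingface_hub import snapshot_download

    for repo_id, local_dir in [
        ("my-org/eval-results", "./eval-results"),  # placeholder for RESULTS_REPO
        ("my-org/eval-queue", "./eval-queue"),      # placeholder for QUEUE_REPO
    ]:
        snapshot_download(
            repo_id=repo_id,
            revision="main",
            local_dir=local_dir,
            repo_type="dataset",
            max_workers=60,   # parallel download workers, as in the diff
            token="hf_...",   # placeholder for TOKEN
        )
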
src/backend/lamini_eval/evaluators/ecommerce_evaluator.py
CHANGED
@@ -7,10 +7,9 @@ from lamini.generation.base_prompt_object import PromptObject
 from lamini.generation.generation_node import GenerationNode
 from lamini.generation.generation_pipeline import GenerationPipeline
 from tqdm import tqdm
-
-from src.backend.lamini_eval.datasets.ecommerce.shopping_dataset import load_shopping_dataset
-from src.backend.lamini_eval.models.gpt4_model import load_gpt4_model
-from src.backend.lamini_eval.models.lamini_model import load_lamini_model
+import hashlib
+# from src.backend.lamini_eval.models.gpt4_model import load_gpt4_model
+# from src.backend.lamini_eval.models.lamini_model import load_lamini_model
 from src.envs import DATASET_PATH, LAMINI_API_KEY
 
 lamini.api_key = LAMINI_API_KEY
@@ -31,11 +30,11 @@ class EcommerceEvaluator:
         self.max_examples = max_examples
         self.batch_size = batch_size
 
-    def load_model(self):
-        if self.model_type == "gpt4":
-            return load_gpt4_model()
-
-        return load_lamini_model(self.model_name)
+    # def load_model(self):
+    #     if self.model_type == "gpt4":
+    #         return load_gpt4_model()
+    #
+    #     return load_lamini_model(self.model_name)
 
     async def load_shopping_dataset(self, path):
         i = 0
@@ -59,20 +58,15 @@ class EcommerceEvaluator:
                 "is_exact_match": answer.data["is_exact_match"]
             }
             short_answers.append(answer)
-            print(f"\n\n=======\n{answer}\n\n")
+            # print(f"\n\n=======\n{answer}\n\n")
             writer.write(answer)
             pbar.update()
         return short_answers
 
-    async def evaluate(self):
-        dataset = load_shopping_dataset(DATASET_PATH)
-
-        answers = AnswerScorePipeline().call(dataset)
-        short_answers = await self.save_results(answers)
-        return short_answers
-
     async def evaluate_hallucination(self):
-
+        dataset = self.load_shopping_dataset(DATASET_PATH)
+        answers = AnswerScorePipeline(answer_model=self.model_name).call(dataset)
+        results = await self.save_results(answers)
         mean_response_score = sum([item["answer"]["score"] for item in results]) / len(results)
         product_id_precision_score = sum([int(item["is_exact_match"]) for item in results]) / len(results)
         results = self.format_results(model_name=self.model_name,
@@ -80,13 +74,18 @@ class EcommerceEvaluator:
                                       product_id_precision_score=product_id_precision_score)
         return results
 
+    def compute_hash(self, input):
+        m = hashlib.md5()
+        m.update(str(input).encode("utf-8"))
+        return m.hexdigest()
+
     def format_results(self, model_name: str, response_subjective_score: float,
                        product_id_precision_score: float) -> dict:
         results = {
             "config": {
                 "model_dtype": "100",  # Precision with which you ran the evaluation
                 "model_name": model_name,  # Name of the model
-                "model_sha":
+                "model_sha": self.compute_hash(model_name)  # Hash of the model
             },
             "results": {
                 "response_subjective_score": {
@@ -102,11 +101,11 @@ class EcommerceEvaluator:
 
 
 class AnswerScorePipeline(GenerationPipeline):
-    def __init__(self):
+    def __init__(self, answer_model="mistralai/Mistral-7B-Instruct-v0.1", score_model="mistralai/Mistral-7B-Instruct-v0.1"):
         super(AnswerScorePipeline, self).__init__()
 
-        self.answer_generator = AnswerGenerator()
-        self.score_generator = ScoreGenerator()
+        self.answer_generator = AnswerGenerator(model_name=answer_model)
+        self.score_generator = ScoreGenerator(model_name=score_model)
 
     def forward(self, x):
         ans = self.answer_generator(x)
@@ -115,9 +114,10 @@ class AnswerScorePipeline(GenerationPipeline):
 
 
 class AnswerGenerator(GenerationNode):
-    def __init__(self):
+    def __init__(self, model_name="mistralai/Mistral-7B-Instruct-v0.1"):
         super(AnswerGenerator, self).__init__(
-            model_name
+            model_name,
+            max_tokens=150
         )
 
     def generate(
@@ -155,22 +155,25 @@ class AnswerGenerator(GenerationNode):
         yield prompt
 
     def make_prompt(self, chunk):
+        # <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
         prompt = (
-            "
+            "<s>[INST] <<SYS>>\nYou are an expert shopper at Instacart.\n<</SYS>>\n\n"
         )
         prompt += "You are helping a customer find a product. "
        prompt += "Include the product name, id, and detailed description in your answer. "
         prompt += "A product id is a number between 0 and 100,000. "
         prompt += "The customer asks\n"
         prompt += chunk.data["question"]
+        prompt += " [/INST]"
         print("answer prompt:", prompt)
         return prompt
 
 
 class ScoreGenerator(GenerationNode):
-    def __init__(self):
+    def __init__(self, model_name="mistralai/Mistral-7B-Instruct-v0.1"):
         super(ScoreGenerator, self).__init__(
-            model_name=
+            model_name=model_name,
+            max_tokens=150
         )
 
     def generate(
@@ -193,8 +196,9 @@ class ScoreGenerator(GenerationNode):
         yield result
 
     def get_rubric(self):
+        # <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
         rubric = (
-            "
+            " <s>[INST] <<SYS>>\nRead this scoring rubric carefully and follow the instructions precisely:\n<</SYS>>\n\n"
         )
         rubric += "A score of 5 means that model's id is the same as the gold answer's id.\n"
         rubric += "A score of 4 means that the model's product name is the same or a paraphrase of the gold answer, but the id may be wrong. For example, the product names 'Tuna' and 'Canned Tuna' are similar\n"
@@ -207,7 +211,7 @@ class ScoreGenerator(GenerationNode):
         rubric += "gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Canned Tuna, Description: Tuna, score == 4\n"
         rubric += "gold answer == Product ID: 5678, Product Name: Tuna, Description: Canned Tuna, model response == Product ID: 1234, Product Name: Bubble Gum, Description: Delicious treat, score == 1\n"
         rubric += "Assign a 5 even if fields are missing, for example: gold answer == Product ID: 1234, model response == Product ID: 1234, score == 5\n"
-
+        rubric += " [/INST]"
         return rubric
 
     def is_exact_match(self, prompt):
@@ -247,8 +251,9 @@ class ScoreGenerator(GenerationNode):
         return formatted_response
 
     def make_prompt(self, chunk):
+        # <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
         prompt = (
-            "
+            "<s>[INST] <<SYS>>\nA model is going to answer a question. Your job is to score the answer, comparing it to a golden reference. You are an expert scorer.\n<</SYS>>\n\n"
         )
         prompt += f"Rate the answer using a score from 1 (lowest match) to 5 (highest match).\n"
         prompt += chunk.data["rubric"]
@@ -262,6 +267,6 @@ class ScoreGenerator(GenerationNode):
         prompt += f"========== model answer =========\n{chunk.data['response']}\n\n"
         prompt += "=" * 40 + "\n\n"
         prompt += f"How would you score the model's answer compared to the gold answer (using the 1-5 scale defined above)?\n\n"
-
+        prompt += " [/INST]"
         print("score prompt:", prompt)
         return prompt

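Note: the bulk of this diff hand-wraps every prompt in the Mistral/Llama-2 instruct template (the "sys tags" of the commit message) and threads model_name through the pipeline instead of hardcoding it. A minimal sketch of that template and of the md5 "model_sha" scheme; make_instruct_prompt is a hypothetical helper for illustration, not a function in this repo:

    import hashlib

    def make_instruct_prompt(system: str, user: str) -> str:
        # Mistral/Llama-2 instruct format, as in the comments in the diff:
        # <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
        return f"<s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]"

    def compute_hash(value) -> str:
        # Same md5 scheme the commit uses to fill "model_sha".
        m = hashlib.md5()
        m.update(str(value).encode("utf-8"))
        return m.hexdigest()

    print(make_instruct_prompt("You are an expert shopper at Instacart.",
                               "Help me find canned tuna."))
    print(compute_hash("mistralai/Mistral-7B-Instruct-v0.1"))
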
src/leaderboard/read_evals.py
CHANGED
@@ -74,7 +74,7 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs)
+            mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
         return self(

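Note: this one-line change rescales per-task accuracy from a 0-1 fraction to the 0-100 percentage the leaderboard displays. With illustrative values:

    import numpy as np

    accs = np.array([0.5, 0.7, 0.9])   # illustrative per-subtask accuracies
    mean_acc = np.mean(accs) * 100.0   # 70.0 instead of 0.7
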
start.sh
CHANGED
@@ -9,6 +9,8 @@
 # -o pipefail | produces a failure code if any stage fails
 set -Eeuoxa pipefail
 
+LOCAL_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
 for ARGUMENT in "$@"
 do
     KEY=$(echo $ARGUMENT | cut -f1 -d=)
@@ -24,4 +26,4 @@ echo "Run mode is: $RUN_MODE"
 echo "Model passed is: $LOCAL_MODEL_NAME"
 
 docker buildx build --platform=linux/amd64 -t ldr .
-docker run -it --rm -p 7860:7860 --platform=linux/amd64 -e RUN_MODE=$RUN_MODE -e LOCAL_MODEL_NAME=$LOCAL_MODEL_NAME ldr python app.py
+docker run -it --rm -p 7860:7860 --platform=linux/amd64 -v $LOCAL_DIRECTORY/output-data:/code/data -e RUN_MODE=$RUN_MODE -e LOCAL_MODEL_NAME=$LOCAL_MODEL_NAME ldr python app.py
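Note: the script now resolves its own directory and bind-mounts output-data into the container at /code/data, so results written during the eval survive the --rm'd container. A rough Python equivalent via the docker SDK, for illustration only (the Space itself drives docker from this shell script, and the env values below are placeholders):

    # Rough equivalent of the updated `docker run`, using the docker SDK
    # (pip install docker). Assumes the `ldr` image has already been built.
    from pathlib import Path
    import docker

    local_directory = Path(__file__).resolve().parent  # like LOCAL_DIRECTORY

    docker.from_env().containers.run(
        "ldr",
        "python app.py",
        ports={"7860/tcp": 7860},
        volumes={str(local_directory / "output-data"): {"bind": "/code/data", "mode": "rw"}},
        environment={"RUN_MODE": "LOCAL", "LOCAL_MODEL_NAME": "my-model"},  # placeholders
        remove=True,
    )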