leaderboard

Runtime error

App Files Files Community

Minseok Bae committed on Jan 4, 2024

Commit

2864204

1 Parent(s): 404587d

Implemented litellm pipeline

Browse files

Files changed (4) hide show

requirements.txt +1 -0
src/backend/model_operations.py +54 -37
src/display/about.py +4 -4
src/envs.py +3 -0

requirements.txt CHANGED Viewed

@@ -5,6 +5,7 @@ datasets==2.14.5
 gradio==4.4.0
 gradio_client==0.7.0
 huggingface-hub>=0.18.0
 matplotlib==3.7.1
 numpy==1.24.2
 pandas==2.0.0

 gradio==4.4.0
 gradio_client==0.7.0
 huggingface-hub>=0.18.0
+litellm==1.15.1
 matplotlib==3.7.1
 numpy==1.24.2
 pandas==2.0.0

src/backend/model_operations.py CHANGED Viewed

@@ -1,12 +1,17 @@
 import logging
 import numpy as np
 import pandas as pd
 import spacy
-from transformers import AutoModelForCausalLM, AutoTokenizer
 from sentence_transformers import CrossEncoder
 import src.backend.util as util
 # Set up basic configuration for logging
 logging.basicConfig(level=logging.INFO,
@@ -15,6 +20,8 @@ logging.basicConfig(level=logging.INFO,
 # Load spacy model for word tokenization
 nlp = spacy.load("en_core_web_sm")
 def load_evaluation_model(model_path):
     """Load the evaluation model from the given path
@@ -29,6 +36,18 @@ def load_evaluation_model(model_path):
     return model
 class ModelLoadingException(Exception):
     """Exception raised for errors in loading a model.
@@ -42,12 +61,13 @@ class ModelLoadingException(Exception):
         self.revision = revision
         super().__init__(f"{messages} id={model_id} revision={revision}")
 class SummaryGenerator:
     """A class to generate summaries using a causal language model.
     Attributes:
-        tokenizer (AutoTokenizer): Tokenizer for the model.
-        model (AutoModelForCausalLM): The causal language model.
         summaries_df (DataFrame): DataFrame to store generated summaries.
         revision (str): Model revision.
         avg_length (float): Average length of summaries.
@@ -62,17 +82,13 @@ class SummaryGenerator:
             model_id (str): Identifier for the model.
             revision (str): Revision of the model.
         """
-        try:
-            self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision)
-            self.model = AutoModelForCausalLM.from_pretrained(model_id, revision)
-        except Exception as e:
-            logging.error(f"Error initializing model with id {model_id} and revision {revision}: {e}")
-            raise ModelLoadingException(model_id, revision) from e
         self.summaries_df = pd.DataFrame()
         self.revision = revision
         self.avg_length = None
         self.answer_rate = None
-        self.error_rate = None
     def generate_summaries(self, df):
         """Generate summaries for a given DataFrame of source docs.
@@ -84,34 +100,43 @@ class SummaryGenerator:
             summaries_df (DataFrame): Generated summaries by the model.
         """
         source, summary, dataset = [], [], []
-        error_count = 0
         for index, row in df.iterrows():
             _source = row['text']
             _dataset = row['dataset']
-            prompt = util.generate_prompt(_source)
-            inputs = self.tokenizer(prompt, return_tensors='pt', max_length=1024,
-                                    revision=self.revision)
-            try:
-                outputs = self.model.generate(**inputs, max_new_tokens=1024, do_sample=False,
-                                            temperature=0.0, revision=self.revision)
-                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True,
-                                                revision=self.revision)
-            except Exception as e:
-                print(f"Error at index {index}: {e}")
-                response = ""
-                error_count += 1
-            summary.append(response)
-            source.append(_source)
-            dataset.append(_dataset)
         self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
                                         columns=["source", "summary", "dataset"])
         self._compute_avg_length()
         self._compute_answer_rate()
-        # self._compute_error_rate(error_count)
         return self.summaries_df
@@ -140,14 +165,6 @@ class SummaryGenerator:
         self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows
-    # def _compute_error_rate(self, count):
-    #     """
-    #     Compute the error rate of summaries.
-    #     """
-    #     total_rows = len(self.summaries_df)
-    #     self.error_rate = 0 if total_rows == 0 else count / total_rows
 class EvaluationModel:
     """A class to evaluate generated summaries.

+import os
+import time
+from datetime import datetime
 import logging
 import numpy as np
 import pandas as pd
 import spacy
+# from transformers import AutoModelForCausalLM, AutoTokenizer
 from sentence_transformers import CrossEncoder
+from litellm import completion
 import src.backend.util as util
+import src.envs as envs
 # Set up basic configuration for logging
 logging.basicConfig(level=logging.INFO,
 # Load spacy model for word tokenization
 nlp = spacy.load("en_core_web_sm")
+os.environ["HUGGINGFACE_API_KEY"] =  envs.TOKEN
 def load_evaluation_model(model_path):
     """Load the evaluation model from the given path
     return model
+def generate_summary(model: str, system_prompt: str, user_prompt: str, api_base: str):
+    response = completion(
+        model=model,
+        messages=[{"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt}],
+        temperature=0.0,
+        max_tokens=1024,
+        api_base=api_base,
+    )
+    return response['choices'][0]['message']['content']
 class ModelLoadingException(Exception):
     """Exception raised for errors in loading a model.
         self.revision = revision
         super().__init__(f"{messages} id={model_id} revision={revision}")
 class SummaryGenerator:
     """A class to generate summaries using a causal language model.
     Attributes:
+        model (str): huggingface/{model_id}
+        api_base (str): https://api-inference.huggingface.co/models/{model_id}
         summaries_df (DataFrame): DataFrame to store generated summaries.
         revision (str): Model revision.
         avg_length (float): Average length of summaries.
             model_id (str): Identifier for the model.
             revision (str): Revision of the model.
         """
+        self.model = f"huggingface/{model_id}"
+        self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
         self.summaries_df = pd.DataFrame()
         self.revision = revision
         self.avg_length = None
         self.answer_rate = None
+        self.exceptions = None
     def generate_summaries(self, df):
         """Generate summaries for a given DataFrame of source docs.
             summaries_df (DataFrame): Generated summaries by the model.
         """
         source, summary, dataset = [], [], []
+        exceptions = []
         for index, row in df.iterrows():
             _source = row['text']
             _dataset = row['dataset']
+            system_prompt = envs.SYSTEM_PROMPT
+            user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
+            while True:
+                try:
+                    _summary = generate_summary(self.model, system_prompt,
+                                                user_prompt, self.api_base)
+                    break
+                except Exception as e:
+                    if 'Rate limit reached' in str(e):
+                        wait_time = 3660
+                        current_time = datetime.now().strftime('%H:%M:%S')
+                        print(f"Rate limit hit at {current_time}. Waiting for 1 hour before retrying...")
+                        time.sleep(wait_time)
+                    else:
+                        print(f"Error at index {index}: {e}")
+                        _summary = ""
+                        exceptions.append(index)
+                        break
+        summary.append(_summary)
+        source.append(_source)
+        dataset.append(_dataset)
+        time.sleep(1)
         self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
                                         columns=["source", "summary", "dataset"])
+        self.exceptions = exceptions
         self._compute_avg_length()
         self._compute_answer_rate()
         return self.summaries_df
         self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows
 class EvaluationModel:
     """A class to evaluate generated summaries.

src/display/about.py CHANGED Viewed

@@ -43,10 +43,10 @@ Our evaluation dataset is composed of 1006 documents from multiple public datase
 We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
 ## Understand each metric
-### - Hallucination Rate: The percentage of summaries that have a hallucination score below 0.5
-### - Factual Consistency Rate: (1 - Hallucination Rate) * 100 (%)
-### - Answer Rate: The percentage of summaries that are non-empty. (This is a proxy for whether the model generates a summary at all)
-### - Average Summary Length: The average number of words in the generated summaries
 ## Reproducibility
 To reproduce our results, here are the commands you can run:

 We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
 ## Understand each metric
+- Hallucination Rate: The percentage of summaries that have a hallucination score below 0.5
+- Factual Consistency Rate: (1 - Hallucination Rate) * 100 (%)
+- Answer Rate: The percentage of summaries that are non-empty. (This is a proxy for whether the model generates a summary at all)
+- Average Summary Length: The average number of words in the generated summaries
 ## Reproducibility
 To reproduce our results, here are the commands you can run:

src/envs.py CHANGED Viewed

@@ -24,3 +24,6 @@ API = HfApi(token=TOKEN)
 SOURCE_PATH = "src/datasets/leaderboard_dataset.csv"
 SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
 HEM_PATH = 'vectara/hallucination_evaluation_model'

 SOURCE_PATH = "src/datasets/leaderboard_dataset.csv"
 SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
 HEM_PATH = 'vectara/hallucination_evaluation_model'
+SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
+USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "