from fastapi import FastAPI
from pydantic import BaseModel
from langchain.embeddings import HuggingFaceEmbeddings  # for using Hugging Face models
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain import PromptTemplate


import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"



#print("Testing teh build")
# NOTE - we configure docs_url to serve the interactive Docs at the root path
# of the app. This way, we can use the docs as a landing page for the app on Spaces.
app = FastAPI(docs_url="/")

class ModelOutputEvaluate(BaseModel):
    question: str
    answer: str
    domain: str
    context: str

class BasePromptContext:
    def __init__(self):
        self.variables_list = ["question","answer","context"]
        self.base_template = """Please act as an impartial judge and evaluate the quality of the provided answer, which attempts to answer the provided question based on the provided context.

You'll need to submit your grading for the correctness, comprehensiveness and readability of the answer, using JSON format with the 2 items in parentheses:
("score": [your score number for the correctness of the answer], "reasoning": [your one line step by step reasoning about the correctness of the answer])

Below is your grading rubric:

- Correctness: If the answer correctly answers the question, below are the details for different scores:
  - Score 0: the answer is completely incorrect, doesn't mention anything about the question or is completely contrary to the correct answer.
      - For example, when asked "How to terminate a databricks cluster", the answer is an empty string, or content that's completely irrelevant, or "sorry, I don't know the answer".
  - Score 4: the answer provides some relevance to the question and answers one aspect of the question correctly.
      - Example:
          - Question: How to terminate a databricks cluster
          - Answer: Databricks cluster is a cloud-based computing environment that allows users to process big data and run distributed data processing tasks efficiently.
          - Or answer: In the Databricks workspace, navigate to the "Clusters" tab. And then this is a hard question that I need to think more about it.
  - Score 7: the answer mostly answers the question but is missing or hallucinating on one critical aspect.
      - Example:
          - Question: How to terminate a databricks cluster
          - Answer: In the Databricks workspace, navigate to the "Clusters" tab.
          Find the cluster you want to terminate from the list of active clusters.
          And then you'll find a button to terminate all clusters at once.
  - Score 10: the answer correctly answers the question and is not missing any major aspect.
      - Example:
          - Question: How to terminate a databricks cluster
          - Answer: In the Databricks workspace, navigate to the "Clusters" tab.
          Find the cluster you want to terminate from the list of active clusters.
          Click on the down-arrow next to the cluster name to open the cluster details.
          Click on the "Terminate" button. A confirmation dialog will appear. Click "Terminate" again to confirm the action.

Provided question:
{question}

Provided answer:
{answer}

Provided context:
{context}

Please provide your grading for the correctness"""
        
        
class Evaluater:
    def __init__(self, item: ModelOutputEvaluate):
        self.question = item.question
        self.answer = item.answer
        self.domain = item.domain
        self.context = item.context
        self.llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 1, "max_length": 1000000})

    def get_prompt_template(self):
        prompt = BasePromptContext()
        template = prompt.base_template
        variables = prompt.variables_list
        eval_template = PromptTemplate(input_variables=variables, template=template)
        return eval_template

    def evaluate(self):
        prompt = self.get_prompt_template().format(question=self.question, answer=self.answer, context=self.context)
        score = self.llm(prompt)
        return score
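
# A minimal sketch (not part of the original file) of using Evaluater directly,
# outside the FastAPI route; the sample question/answer/context values below are
# illustrative placeholders only, and the HUGGINGFACEHUB_API_TOKEN set above must
# be valid for the HuggingFaceHub call to succeed:
#
#   item = ModelOutputEvaluate(
#       question="How to terminate a databricks cluster",
#       answer='In the Databricks workspace, open the "Clusters" tab and click "Terminate".',
#       domain="databricks",
#       context="Databricks documentation page on cluster lifecycle management.",
#   )
#   print(Evaluater(item).evaluate())  # prints the raw grading text returned by the judge model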

# Evaluation endpoint: scores the provided answer for the given question and context
@app.post("/evaluate/")
async def create_evaluation_scenario(item: ModelOutputEvaluate):
    output = {
        "input": item,
        "score" : Evaluater(item).evaluate()
    }
    return output
# def evaluate(question: str):
#     # question = "what is the document about?"
#     answer = search(question)
#     # print(question, answer)
#     return {answer}
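
# A client-side sketch for calling the /evaluate/ endpoint. It assumes the app is
# served locally (e.g. `uvicorn main:app --port 8000`); the module name "main", the
# port, and the payload values are assumptions for illustration, not part of this file:
#
#   import requests
#
#   payload = {
#       "question": "How to terminate a databricks cluster",
#       "answer": "In the Databricks workspace, navigate to the Clusters tab and click Terminate.",
#       "domain": "databricks",
#       "context": "Databricks documentation on cluster management.",
#   }
#   response = requests.post("http://localhost:8000/evaluate/", json=payload)
#   print(response.json())  # e.g. {"input": {...}, "score": "<grading text from the judge model>"}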