File size: 5,656 Bytes
26e5d37
 
 
677d730
8ec916f
 
26e5d37
677d730
26e5d37
 
 
a824423
26e5d37
 
 
 
 
 
8ec916f
26e5d37
 
8ec916f
26e5d37
 
8ec916f
26e5d37
 
8ec916f
26e5d37
8ec916f
26e5d37
 
 
 
 
 
a824423
26e5d37
 
8ec916f
26e5d37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ec916f
26e5d37
831981c
26e5d37
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import spaces
import gradio as gr
from transformers import AutoTokenizer, LlamaForCausalLM
import os
import fastchat
from fastchat.conversation import get_conv_template

# Access token for the Hugging Face Hub, read from the environment
# (None when HUGGINGFACE_TOKEN is not set).
_HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

# The tokenizer is pulled from the Llama-2 chat repository, while the weights
# below come from the Prometheus repository.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    use_auth_token=_HF_TOKEN,
)

# Load the evaluator model in 8-bit, letting `device_map="auto"` decide the
# placement of the weights.
model = LlamaForCausalLM.from_pretrained(
    "kaist-ai/Prometheus-13b-v1.0",
    device_map="auto",
    load_in_8bit=True,
)

@spaces.GPU
def evaluate_task(instruction_to_evaluate, response_to_evaluate, reference_answer, criteria_description, score1_description, score2_description, score3_description, score4_description, score5_description):
    """Grade a response with Prometheus against a five-level score rubric.

    The nine text arguments are interpolated into a fixed evaluation prompt,
    the prompt is tokenized and moved to the GPU, and the model samples up to
    650 new tokens.

    Args:
        instruction_to_evaluate: The instruction the response was written for.
        response_to_evaluate: The response to be graded.
        reference_answer: An answer that would receive a score of 5.
        criteria_description: Description of the evaluation criterion.
        score1_description: Rubric text for score 1.
        score2_description: Rubric text for score 2.
        score3_description: Rubric text for score 3.
        score4_description: Rubric text for score 4.
        score5_description: Rubric text for score 5.

    Returns:
        The text generated by the model after the prompt (the prompt itself
        and special tokens are not included). The prompt asks the model for
        the form ``Feedback: ... [RESULT] <1-5>``, but the output is returned
        as generated, without validation.
    """
    # NOTE(review): the wording below (including its grammar and the trailing
    # spaces after two of the placeholders) presumably mirrors the prompt
    # template published with the model — keep it verbatim; confirm before
    # editing.
    input_text = f"""###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)\"
4. Please do not generate any other opening, closing, and explanations.

###The instruction to evaluate:
{instruction_to_evaluate}

###Response to evaluate:
{response_to_evaluate} 

###Reference Answer (Score 5):
{reference_answer} 

###Score Rubrics:
{criteria_description}
Score 1: {score1_description}
Score 2: {score2_description}
Score 3: {score3_description}
Score 4: {score4_description}
Score 5: {score5_description}
###Feedback:"""

    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

    # `do_sample` is the generate() switch that enables sampling; without it
    # `temperature` and `top_p` have no effect. (`sample` is not a generate()
    # argument.)
    outputs = model.generate(input_ids, do_sample=True, temperature=1.0, top_p=0.9, max_new_tokens=650, repetition_penalty=1.03)

    # A causal LM returns prompt + continuation in one sequence; drop the
    # prompt tokens so the caller only sees the newly generated feedback.
    prompt_length = input_ids.shape[-1]
    generated_ids = outputs[0][prompt_length:]
    result = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return result

# Gradio UI: nine text inputs (instruction, response, reference answer, the
# criterion and the five rubric levels) are passed positionally to
# `evaluate_task`; its returned string is shown in a single text output.
iface = gr.Interface(
    fn=evaluate_task,
    inputs=[
        gr.Textbox(label="Instruction to Evaluate", placeholder="Enter Instruction Here...", lines=5),
        gr.Textbox(label="Response to Evaluate", placeholder="Enter Response Here...", lines=5),
        gr.Textbox(label="Reference Answer", placeholder="Enter Reference Answer Here...", lines=5),
        gr.Textbox(label="Criteria Description", placeholder="Enter Criteria Description Here...", lines=2),
        gr.Textbox(label="Score 1 Description", placeholder="Enter Score 1 Description Here...", lines=2),
        gr.Textbox(label="Score 2 Description", placeholder="Enter Score 2 Description Here...", lines=2),
        gr.Textbox(label="Score 3 Description", placeholder="Enter Score 3 Description Here...", lines=2),
        gr.Textbox(label="Score 4 Description", placeholder="Enter Score 4 Description Here...", lines=2),
        gr.Textbox(label="Score 5 Description", placeholder="Enter Score 5 Description Here...", lines=2)
    ],
    # A bare string here is interpreted as a component shortcut name (such as
    # "text"), so the display name has to be supplied as the label of an
    # explicit output component instead.
    outputs=gr.Textbox(label="🎏KAIST-AI/⚖️Prometheus"),
    title="Welcome to🌟Tonic's⚖️Prometheus",
    description="[🎏KAIST-AI/⚖️Prometheus](https://huggingface.co/kaist-ai/prometheus-13b-v1.0) Prometheus is an alternative of GPT-4 evaluation when doing fine-grained evaluation of an underlying LLM & a Reward model for Reinforcement Learning from Human Feedback (RLHF). You can use this demo to try out their model !  You can also use [🎏KAIST-AI/⚖️Prometheus](https://huggingface.co/kaist-ai/prometheus-13b-v1.0) [by cloning this space](https://huggingface.co/spaces/Tonic/prometheus/tree/main?clone=true). [🧬🔬🔍 Simply click here: 🤗](https://huggingface.co/spaces/Tonic/prometheus?duplicate=true) Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community 👻  [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟 [DataTonic](https://github.com/Tonic-AI/DataTonic) 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗",
    # Each example row supplies one value per input, in the same order as
    # `inputs` above.
    examples=[
        ["Your boss seems to be quite critical. How do you feel about your work environment?", "I feel like my ideas are not valued and I'm overwhelmed.", "I understand that feeling undervalued and overwhelmed can be very challenging. It's important to communicate your feelings and find strategies to manage your workload.", "Evaluate the emotional context and provide supportive advice.", "Does not recognize emotional context.", "Acknowledges emotions but offers no support.", "Recognizes emotions and offers generic advice.", "Shows understanding and provides specific advice.", "Deeply empathizes and offers comprehensive, personalized advice."],
        ["You mentioned feeling lonely in the new city. What have you tried to meet new people?", "I haven't really tried anything yet. I'm not sure where to start.", "It's completely normal to feel unsure at the beginning. Exploring local events or online communities related to your interests can be a great start.", "Assess the initiative for social connections and suggest actionable steps.", "Ignores the issue of loneliness.", "Briefly acknowledges loneliness without suggestions.", "Mentions loneliness and suggests a common idea.", "Understands the challenge and offers multiple suggestions.", "Empathetically addresses the feeling and provides tailored, actionable advice."]
    ]
)

iface.launch()