File size: 5,625 Bytes
422e042
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eddcee7
f502fcb
 
eddcee7
f502fcb
 
422e042
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe68a7a
422e042
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a5510e
422e042
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86478a5
422e042
86478a5
422e042
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86478a5
422e042
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Log in to the Hugging Face Hub as a privileged user (presumably required to
# download the Llama-2 base weights used below -- confirm against the repo's
# access settings).
import os
HF_TOKEN = os.environ.get("HF_TOKEN")

from huggingface_hub import login
# login() without a token falls back to an interactive prompt, which cannot be
# answered in a headless deployment. Only log in when a token is configured;
# otherwise any previously cached credentials are left in effect.
if HF_TOKEN:
    login(token=HF_TOKEN)

from threading import Thread
from typing import Iterator

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

from pyreft import ReftModel, get_intervention_locations

# Bounds for the "Max new tokens" slider: its default value and its upper limit.
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_MAX_NEW_TOKENS = 2048
# Longest prompt (in tokens) kept before the oldest tokens are trimmed away;
# overridable through the MAX_INPUT_TOKEN_LENGTH environment variable.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Markdown rendered at the top of the demo page via gr.Markdown(DESCRIPTION).
# The leading backslash suppresses the newline right after the opening quotes.
DESCRIPTION = """\
# ReFT-Chat (Llama-2 7B with 1K examples)

### What's ReFT-Chat?
ReFT-Chat is a chatbot built with ReFT and Llama-2 7B. It is trained with 1K training examples from the unpaired [Ultrafeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). It is not good at multi-turn conversations. You can train your own ReFT agent and share it on HuggingFace by following this [tutorial](https://github.com/stanfordnlp/pyreft/tree/main/examples/gradio/train_and_share.ipynb)!

### Usage Terms
This should only be used for research purposes. We did not conduct additional safety training with ReFT. We evaluate this model using [Alpaca-eval](https://github.com/tatsu-lab/alpaca_eval). Performance results can be found in [our ReFT paper](https://arxiv.org/abs/2404.03592). Our model inherits all the underlying risks associated with Llama. See terms outlined below.
"""

# Markdown/HTML footer rendered at the bottom of the page via
# gr.Markdown(LICENSE); links to the upstream Llama-2 license and use policy.
LICENSE = """
<p/>

---
As a derivate work of [Llama-2-7b-chat](https://huggingface.co/meta-llama/Llama-2-7b-chat) by Meta,
this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/USE_POLICY.md).
"""

# Choose the runtime path once. With CUDA, load the base model, attach the
# ReFT interventions from the Hub and create the tokenizer. Without CUDA,
# nothing is loaded and the page description only gains a notice.
if torch.cuda.is_available():
    model_id = "meta-llama/Llama-2-7b-hf"
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="cuda",
        torch_dtype=torch.bfloat16,
    )
    reft_model = ReftModel.load(
        "pyvene/reft_chat7b_1k",
        model,
        from_huggingface_hub=True,
    )
    reft_model.set_device("cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = True
else:
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

# Instruction-style prompt wrapper; the single %s slot receives the flattened
# conversation text built in generate().
prompt_no_input_template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "%s\n"
    "\n"
    "### Response:\n"
)

@spaces.GPU
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream the ReFT model's reply to ``message``.

    Earlier turns and the new message are flattened into one instruction,
    wrapped in ``prompt_no_input_template``, tokenized, and trimmed to the last
    ``MAX_INPUT_TOKEN_LENGTH`` tokens if necessary. Generation runs on a
    background thread and is consumed through a ``TextIteratorStreamer``.

    Args:
        message: The newest user message.
        chat_history: Previous ``(user, assistant)`` turns.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Accepted for signature compatibility; not forwarded,
            because generation runs with ``do_sample=False``.
        top_p: Accepted for signature compatibility; not forwarded.
        top_k: Accepted for signature compatibility; not forwarded.
        repetition_penalty: Forwarded to the underlying ``generate`` call.

    Yields:
        The reply accumulated so far, growing with each streamed text chunk.
    """
    # Flatten prior turns plus the new message into a single instruction body.
    turns = [
        f"user: {user} assistant : {assistant}"
        for user, assistant in chat_history
    ]
    turns.append(message)
    prompt_text = prompt_no_input_template % "\n".join(turns)

    encoded = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # Keep only the most recent tokens when the prompt is too long.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        attention_mask = attention_mask[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")

    # Intervention positions are derived from the (possibly trimmed) prompt
    # length. "f5+l5" presumably selects the first and last five prompt
    # positions -- confirm against pyreft. The permute swaps the leading two
    # axes of the wrapped result before it is converted to nested lists.
    intervention_locations = torch.tensor([get_intervention_locations(
        last_position=input_ids.shape[-1], positions="f5+l5",
        num_interventions=len(reft_model.interventions))]).permute(1, 0, 2).tolist()

    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = {
        # Pass the trimmed tensors: the intervention locations above were
        # computed from their length, so the untrimmed encoding would both
        # ignore the length cap and disagree with those locations.
        "base": {"input_ids": input_ids, "attention_mask": attention_mask},
        "unit_locations": {"sources->base": (None, intervention_locations)},
        "intervene_on_prompt": True,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "eos_token_id": tokenizer.eos_token_id,
        "early_stopping": True,
        "no_repeat_ngram_size": 5,
        "repetition_penalty": repetition_penalty,
        "do_sample": False,
    }

    # Generate on a worker thread so this generator can drain the streamer
    # and yield partial text as it arrives.
    t = Thread(target=reft_model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)


def _chat_fn(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int,
    repetition_penalty: float,
) -> Iterator[str]:
    """Adapt the two UI sliders to ``generate``'s keyword parameters.

    Gradio passes ``additional_inputs`` positionally after the chat history.
    ``generate`` has other optional parameters between ``max_new_tokens`` and
    ``repetition_penalty``, so wiring it in directly would deliver the
    "Repetition penalty" slider to the wrong parameter and leave the real
    penalty stuck at its default. Forwarding by keyword avoids that.
    """
    yield from generate(
        message,
        chat_history,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
    )


# Chat widget: the slider order below must match _chat_fn's parameters after
# (message, chat_history).
chat_interface = gr.ChatInterface(
    fn=_chat_fn,
    additional_inputs=[
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.1,
        ),
    ],
    stop_btn=None,
    examples=[
        ["Hello there! How are you doing?"],
        ["Can you explain briefly to me what is the Python programming language?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a Helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
    ],
)

# Page layout, top to bottom: description, duplicate-space button, the chat
# widget, then the license footer.
demo = gr.Blocks(css="style.css")
with demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
    chat_interface.render()
    gr.Markdown(LICENSE)

if __name__ == "__main__":
    # Enable the request queue with at most 20 pending requests, then serve.
    demo.queue(max_size=20)
    demo.launch()