import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    StoppingCriteria,
)
from threading import Thread
import gradio as gr

has_gpu = torch.cuda.is_available()
device = "cuda" if has_gpu else "cpu"

# create new tensors on the selected device by default
torch.set_default_device(device)

# phi-2 shipped with custom modeling code, so trust_remote_code is required
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    # torch_dtype=torch.float16 if has_gpu else torch.float32,
    torch_dtype=torch.float32,
    device_map=device,
    trust_remote_code=True,
)


# custom stopping criterion: phi-2 tends to hallucinate follow-up prompts
# (e.g. "Exercise"), so stop generating once one of these phrases appears.
# the stop tokens themselves are still included in the output.
class Phi2StoppingCriteria(StoppingCriteria):
    def __init__(self):
        stop_list = ["Exercise", "Exercises", "exercises:", "<|endoftext|>"]
        # pre-tokenize each stop phrase once so __call__ only compares id lists
        tokenphrases = []
        for token in stop_list:
            tokenphrases.append(
                tokenizer(token, return_tensors="pt").input_ids[0].tolist()
            )
        self.tokenphrases = tokenphrases

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        # stop as soon as the newest tokens exactly match any stop phrase
        for tokenphrase in self.tokenphrases:
            if tokenphrase == input_ids[0].tolist()[-len(tokenphrase):]:
                return True
        return False


def generate(
    prompt,
    max_new_tokens=75,
    terminate_hallucinated_prompts=True,
    sampling=False,
    temperature=1.0,
    top_k=50,
    top_p=1.0,
):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # thanks https://huggingface.co/spaces/joaogante/transformers_streaming/blob/main/app.py
    streamer = TextIteratorStreamer(tokenizer)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=sampling,
        stopping_criteria=[Phi2StoppingCriteria()]
        if terminate_hallucinated_prompts
        else None,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
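    # run model.generate on a worker thread; TextIteratorStreamer then yields
    # decoded text chunks on this thread as they are produced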
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output
    return model_output
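
# usage sketch (illustrative prompt): generate() streams outside Gradio too:
#   for partial in generate("Write a haiku about the sea.", max_new_tokens=30):
#       print(partial)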


demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Text(
            label="prompt",
            value="Write a detailed analogy between mathematics and a lighthouse.",
        ),
        gr.Slider(minimum=0, maximum=500, step=1, value=50, label="max new tokens"),
        gr.Checkbox(
            value=True,
            label="terminate hallucinated prompts",
            info="stop generation after emitting tokens like 'Exercise' or '<|endoftext|>', but do not remove them from the output.",
        ),
        gr.Checkbox(
            label="do sampling",
            info="introduces randomness for non-deterministic results; required for the options below",
            value=True,
        ),
        gr.Slider(
            label="temperature",
            info="higher temperature means more randomness",
            value=1.0,
            minimum=0.1,
            maximum=1.5,
            step=0.1,
        ),
        gr.Slider(
            label="top-k",
            info="consider only the k most likely tokens",
            value=50,
            minimum=1,
            maximum=100,
            step=1,
        ),
        gr.Slider(
            label="top-p",
            info="sample from the smallest set of tokens whose cumulative probability exceeds p",
            value=1.0,
            minimum=0.1,
            maximum=1.0,
            step=0.1,
        ),
    ],
    outputs="text",
    examples=[
        [
            "Write a detailed analogy between mathematics and a lighthouse.",
            75,
        ],
        [
            "Instruct: Write a detailed analogy between mathematics and a lighthouse.\nOutput:",
            100,
        ],
        [
            "Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions?\n\nBob: ",
            150,
        ],
        [
            '''```
def print_prime(n):
   """
   Print all primes between 1 and n
   """\n''',
            125,
        ],
    ],
    title="Microsoft Phi-2",
    description="Unofficial demo of Microsoft Phi-2, a high-performing model with only 2.7B parameters.",
)


if __name__ == "__main__":
    demo.queue().launch(show_api=False)