"""Gradio demo: chat with Phi-3-medium-4k-instruct and Phi-3-medium-128k-instruct in separate tabs."""

import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
)

# Load the 4k-context model and its tokenizer on the GPU in fp16.
model_name1 = "microsoft/Phi-3-medium-4k-instruct"
model1 = AutoModelForCausalLM.from_pretrained(
    model_name1, device_map="cuda", torch_dtype=torch.float16, trust_remote_code=True
)
tokenizer1 = AutoTokenizer.from_pretrained(model_name1)

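# Custom stopping criterion shared by both models: end generation as soon as
# the most recently generated token id matches one of the hard-coded stop ids.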
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = [29, 0]
        for stop_id in stop_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False

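# Streaming chat handler for the 4k-context model; the `spaces.GPU` decorator
# allocates a GPU for this call (up to 20 seconds).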
@spaces.GPU(duration=20, queue=False)
def predict1(message, history, temperature, max_tokens, top_p, top_k):
    # `history` arrives from gr.ChatInterface as a list of [user, assistant] pairs.
    history_transformer_format = history + [[message, ""]]
    stop = StopOnTokens()

    # Build a Phi-3 style chat prompt from the conversation history.
    prompt = ""
    for user_msg, assistant_msg in history_transformer_format:
        prompt += "<|user|>\n" + user_msg + "<|end|>\n<|assistant|>\n"
        if assistant_msg:
            prompt += assistant_msg + "<|end|>\n"

    model_inputs = tokenizer1([prompt], return_tensors="pt").to("cuda")
    streamer = TextIteratorStreamer(
        tokenizer1, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        stopping_criteria=StoppingCriteriaList([stop]),
    )
    # Run generation in a background thread and stream tokens back as they arrive.
    t = Thread(target=model1.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        if new_token != "<":  # skip a lone "<" fragment from partially streamed special tokens
            partial_message += new_token
        yield partial_message

# Load the 128k-context model and its tokenizer on the GPU in fp16.
model_name = "microsoft/Phi-3-medium-128k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="cuda", torch_dtype=torch.float16, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

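# Streaming chat handler for the 128k-context model; same logic as predict1,
# but uses the 128k model and a longer 40-second GPU window.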
@spaces.GPU(duration=40, queue=False)
def predict(message, history, temperature, max_tokens, top_p, top_k):
    history_transformer_format = history + [[message, ""]]
    stop = StopOnTokens()

    # Build a Phi-3 style chat prompt from the conversation history.
    prompt = ""
    for user_msg, assistant_msg in history_transformer_format:
        prompt += "<|user|>\n" + user_msg + "<|end|>\n<|assistant|>\n"
        if assistant_msg:
            prompt += assistant_msg + "<|end|>\n"

    model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        stopping_criteria=StoppingCriteriaList([stop]),
    )
    # Run generation in a background thread and stream tokens back as they arrive.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        if new_token != "<":  # skip a lone "<" fragment from partially streamed special tokens
            partial_message += new_token
        yield partial_message

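# Web UI: one ChatInterface per model; the two are combined into tabs below.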
with gr.Blocks() as demo_4k:
    gr.ChatInterface(
        fn=predict1,
        title="Phi-3-medium-4k-instruct",
        additional_inputs=[
            gr.Slider(0.1, 0.9, value=0.7, label="Temperature"),
            gr.Slider(512, 4096, value=4096, label="Max Tokens"),
            gr.Slider(0.1, 0.9, value=0.7, label="top_p"),
            gr.Slider(10, 90, value=40, label="top_k"),
        ],
    )
    
    
with gr.Blocks() as demo_128k:
    gr.ChatInterface(
        fn=predict,
        title="Phi-3-medium-128k-instruct",
        additional_inputs=[
            gr.Slider(0.1, 0.9, value=0.7, label="Temperature"),
            gr.Slider(64000, 128000, value=100000, label="Max Tokens"),
            gr.Slider(0.1, 0.9, value=0.7, label="top_p"),
            gr.Slider(10, 90, value=40, label="top_k"),
        ],
    )

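# Combine both chat UIs into a single tabbed demo and launch it.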
with gr.Blocks(title="Phi 3 Medium DEMO") as demo:
    gr.Markdown("# Phi3 Medium all in one")
    gr.TabbedInterface([demo_128k, demo_4k], ["Phi3 medium 128k", "Phi3 medium 4k"])

demo.launch(share=True)