ewftrhyjk committed on
Commit f663d13
1 Parent(s): 2c0bdda

Update app.py

Files changed (1)
  1. app.py +103 -1
app.py CHANGED
@@ -1,3 +1,105 @@
import gradio as gr
+ import torch
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+ )
+ import os
+ from threading import Thread
+ import spaces
+ import time
+ import subprocess

- gr.load("models/microsoft/phi-1").launch()
+ subprocess.run(
+     "pip install flash-attn --no-build-isolation",
+     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+     shell=True,
+ )
+
+ token = os.environ["HF_TOKEN"]
+
+
+ model = AutoModelForCausalLM.from_pretrained(
+     "microsoft/Phi-3-mini-128k-instruct",
+     token=token,
+     trust_remote_code=True,
+ )
+ tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", token=token)
+ terminators = [
+     tok.eos_token_id,
+ ]
+
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+ else:
+     device = torch.device("cpu")
+     print("Using CPU")
+
+ model = model.to(device)
+ # Dispatch Errors
+
+
+ @spaces.GPU(duration=60)
+ def chat(message, history, temperature, do_sample, max_tokens):
+     chat = []
+     for item in history:
+         chat.append({"role": "user", "content": item[0]})
+         if item[1] is not None:
+             chat.append({"role": "assistant", "content": item[1]})
+     chat.append({"role": "user", "content": message})
+     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+     model_inputs = tok([messages], return_tensors="pt").to(device)
+     streamer = TextIteratorStreamer(
+         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
+     )
+     generate_kwargs = dict(
+         model_inputs,
+         streamer=streamer,
+         max_new_tokens=max_tokens,
+         do_sample=True,
+         temperature=temperature,
+         eos_token_id=terminators,
+     )
+
+     if temperature == 0:
+         generate_kwargs["do_sample"] = False
+
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     partial_text = ""
+     for new_text in streamer:
+         partial_text += new_text
+         yield partial_text
+
+     yield partial_text
+
+
+ demo = gr.ChatInterface(
+     fn=chat,
+     examples=[["Write me a poem about Machine Learning."]],
+     # multimodal=False,
+     additional_inputs_accordion=gr.Accordion(
+         label="⚙️ Parameters", open=False, render=False
+     ),
+     additional_inputs=[
+         gr.Slider(
+             minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
+         ),
+         gr.Checkbox(label="Sampling", value=True),
+         gr.Slider(
+             minimum=128,
+             maximum=4096,
+             step=1,
+             value=512,
+             label="Max new tokens",
+             render=False,
+         ),
+     ],
+     stop_btn="Stop Generation",
+     title="Chat With LLMs",
+     description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)",
+ )
+ demo.launch()
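
The core of the new app.py is the streaming pattern: model.generate runs on a background thread while a TextIteratorStreamer yields decoded text back to the Gradio generator as tokens arrive. Below is a minimal standalone sketch of that same pattern; the small model id "sshleifer/tiny-gpt2" is only a placeholder assumption for a quick CPU test, whereas the Space itself loads microsoft/Phi-3-mini-128k-instruct with an HF_TOKEN.

# Minimal sketch of the streaming pattern used in app.py (background generate
# thread feeding a TextIteratorStreamer). "sshleifer/tiny-gpt2" is a
# placeholder model for a quick local test, not the model the Space deploys.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "sshleifer/tiny-gpt2"
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tok("Write me a poem about Machine Learning.", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until completion, so it runs in a worker thread while the
# main thread consumes partial text from the streamer as tokens are produced.
thread = Thread(
    target=model.generate,
    kwargs=dict(inputs, streamer=streamer, max_new_tokens=32, do_sample=False),
)
thread.start()

partial_text = ""
for new_text in streamer:
    partial_text += new_text
    print(partial_text)
thread.join()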