JUNGU committed on
Commit
33cc946
•
1 Parent(s): c662eeb

Update app.py

Files changed (1)
  1. app.py +25 -10
app.py CHANGED
@@ -11,25 +11,30 @@ import spaces
 import time
 import subprocess
 
+# Install the flash-attn library; the CUDA build step is skipped.
 subprocess.run(
     "pip install flash-attn --no-build-isolation",
     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
     shell=True,
 )
 
+# Read the Hugging Face access token
 token = os.environ["HF_TOKEN"]
 
-
+# Load the microsoft/Phi-3-mini-128k-instruct model and tokenizer
 model = AutoModelForCausalLM.from_pretrained(
     "microsoft/Phi-3-mini-128k-instruct",
     token=token,
     trust_remote_code=True,
 )
 tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", token=token)
+
+# Set the end-of-sequence token IDs
 terminators = [
     tok.eos_token_id,
 ]
 
+# Use the GPU if one is available, otherwise fall back to the CPU
 if torch.cuda.is_available():
     device = torch.device("cuda")
     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
@@ -38,37 +43,46 @@ else:
     print("Using CPU")
 
 model = model.to(device)
-# Dispatch Errors
-
 
+# Run chat on Spaces GPU resources; the GPU is held for up to 60 seconds per call.
 @spaces.GPU(duration=60)
 def chat(message, history, temperature, do_sample, max_tokens):
+    # Convert the chat history into the expected message format
     chat = []
     for item in history:
         chat.append({"role": "user", "content": item[0]})
         if item[1] is not None:
             chat.append({"role": "assistant", "content": item[1]})
     chat.append({"role": "user", "content": message})
+
+    # Apply the chat template and tokenize the input
     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
     model_inputs = tok([messages], return_tensors="pt").to(device)
+
+    # Stream the model output with TextIteratorStreamer
    streamer = TextIteratorStreamer(
         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
     )
+
+    # Set the generation parameters
     generate_kwargs = dict(
         model_inputs,
         streamer=streamer,
-        max_new_tokens=max_tokens,
-        do_sample=True,
-        temperature=temperature,
-        eos_token_id=terminators,
+        max_new_tokens=max_tokens,  # maximum number of new tokens to generate
+        do_sample=True,  # whether to sample
+        temperature=temperature,  # higher values increase output diversity
+        eos_token_id=terminators,  # end-of-sequence token IDs
     )
 
+    # Disable sampling when the temperature is 0
     if temperature == 0:
         generate_kwargs["do_sample"] = False
 
+    # Start generation in a separate thread
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
+    # Yield the generated text as it streams in
     partial_text = ""
     for new_text in streamer:
         partial_text += new_text
@@ -76,11 +90,10 @@ def chat(message, history, temperature, do_sample, max_tokens):
 
         yield partial_text
 
-
+# Build the conversational UI with Gradio's ChatInterface
 demo = gr.ChatInterface(
     fn=chat,
     examples=[["Write me a poem about Machine Learning."]],
-    # multimodal=False,
     additional_inputs_accordion=gr.Accordion(
         label="⚙️ Parameters", open=False, render=False
     ),
@@ -102,4 +115,6 @@ demo = gr.ChatInterface(
     title="Chat With LLMs",
     description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)",
 )
-demo.launch()
+
+# Launch the Gradio interface
+demo.launch()
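
For reference, the streaming pattern documented in chat() above can be exercised outside Gradio. The snippet below is a minimal sketch, not part of the commit: it reuses the model and tokenizer names from the diff, the prompt text is an arbitrary example, and the HF_TOKEN / flash-attn setup is omitted for brevity.

import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "microsoft/Phi-3-mini-128k-instruct"
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Build a single-turn prompt with the chat template (the message text is arbitrary).
chat = [{"role": "user", "content": "Write me a poem about Machine Learning."}]
prompt = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = tok([prompt], return_tensors="pt").to(device)

# The streamer receives tokens as generate() produces them on a worker thread.
streamer = TextIteratorStreamer(tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
gen_kwargs = dict(inputs, streamer=streamer, max_new_tokens=128, do_sample=False)
Thread(target=model.generate, kwargs=gen_kwargs).start()

# Draining the streamer on the main thread gives incremental text, as chat() does.
partial = ""
for piece in streamer:
    partial += piece
    print(piece, end="", flush=True)

model.generate() blocks until generation finishes, so it runs on a worker thread while the main thread consumes the streamer; this is what lets the Gradio callback yield partial_text incrementally.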