hysts HF staff committed on
Commit
400839c
1 Parent(s): 87968ab

Add system monitor

Browse files
Files changed (5) hide show
  1. Dockerfile +2 -0
  2. app_system_monitor.py +87 -0
  3. app_training.py +15 -4
  4. requirements-monitor.txt +4 -0
  5. trainer.py +13 -8
Dockerfile CHANGED
@@ -44,6 +44,8 @@ RUN pyenv install ${PYTHON_VERSION} && \
44
  RUN pip install --no-cache-dir -U torch==1.13.1 torchvision==0.14.1
45
  COPY --chown=1000 requirements.txt /tmp/requirements.txt
46
  RUN pip install --no-cache-dir -U -r /tmp/requirements.txt
 
 
47
 
48
  COPY --chown=1000 . ${HOME}/app
49
  RUN cd Tune-A-Video && patch -p1 < ../patch
44
  RUN pip install --no-cache-dir -U torch==1.13.1 torchvision==0.14.1
45
  COPY --chown=1000 requirements.txt /tmp/requirements.txt
46
  RUN pip install --no-cache-dir -U -r /tmp/requirements.txt
47
+ COPY --chown=1000 requirements-monitor.txt /tmp/requirements-monitor.txt
48
+ RUN pip install --no-cache-dir -U -r /tmp/requirements-monitor.txt
49
 
50
  COPY --chown=1000 . ${HOME}/app
51
  RUN cd Tune-A-Video && patch -p1 < ../patch
app_system_monitor.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ from __future__ import annotations
4
+
5
+ import collections
6
+
7
+ import gradio as gr
8
+ import nvitop
9
+ import pandas as pd
10
+ import plotly.express as px
11
+ import psutil
12
+
13
+
14
class SystemMonitor:
    """Keep a rolling history of host-memory and GPU usage for display.

    Each metric is stored in a fixed-length deque of ``MAX_SIZE`` samples
    that starts out filled with zeros, so the graph always has the same
    number of points. Only the first device reported by ``nvitop`` is
    sampled; when no device is found the GPU series stay at zero and the
    GPU strings stay empty.
    """

    # Number of samples kept per metric. The graph's x axis is built as
    # range(-MAX_SIZE + 1, 1), i.e. -60..0 for the default of 61.
    MAX_SIZE = 61

    def __init__(self):
        self.devices = nvitop.Device.all()
        self.cpu_memory_usage = self._new_history()
        self.cpu_memory_usage_str = ''
        self.gpu_memory_usage = self._new_history()
        self.gpu_util = self._new_history()
        self.gpu_memory_usage_str = ''
        self.gpu_util_str = ''

    def _new_history(self) -> collections.deque:
        """Return a zero-filled deque capped at ``MAX_SIZE`` entries."""
        return collections.deque([0] * self.MAX_SIZE, maxlen=self.MAX_SIZE)

    def update(self) -> None:
        """Take one sample of every tracked metric."""
        self.update_cpu()
        self.update_gpu()

    def update_cpu(self) -> None:
        """Record the current host memory usage (percentage and text)."""
        memory = psutil.virtual_memory()
        self.cpu_memory_usage.append(memory.percent)
        self.cpu_memory_usage_str = f'{memory.used / 1024**3:0.2f}GiB / {memory.total / 1024**3:0.2f}GiB ({memory.percent}%)'

    def update_gpu(self) -> None:
        """Record memory usage and utilization of the first GPU, if any."""
        if not self.devices:
            return
        device = self.devices[0]
        # Query each metric exactly once so the value appended to the
        # history and the value shown in the text come from the same
        # reading.
        memory_percent = device.memory_percent()
        gpu_utilization = device.gpu_utilization()
        self.gpu_memory_usage.append(memory_percent)
        self.gpu_util.append(gpu_utilization)
        self.gpu_memory_usage_str = f'{device.memory_usage()} ({memory_percent}%)'
        self.gpu_util_str = f'{gpu_utilization}%'

    def get_json(self) -> dict[str, str]:
        """Return the latest human-readable value of each metric."""
        return {
            'CPU memory usage': self.cpu_memory_usage_str,
            'GPU memory usage': self.gpu_memory_usage_str,
            'GPU Util': self.gpu_util_str,
        }

    def get_graph_data(self) -> dict[str, list[int | float]]:
        """Return the sample histories as plain lists keyed by series name.

        The deques are copied into lists so the result matches the
        annotated return type and later samples do not change it.
        """
        return {
            'index': list(range(-self.MAX_SIZE + 1, 1)),
            'CPU memory usage': list(self.cpu_memory_usage),
            'GPU memory usage': list(self.gpu_memory_usage),
            'GPU Util': list(self.gpu_util),
        }

    def get_graph(self):
        """Build a line chart of all three series over the sample window."""
        df = pd.DataFrame(self.get_graph_data())
        figure = px.line(
            df,
            x='index',
            y=[
                'CPU memory usage',
                'GPU memory usage',
                'GPU Util',
            ],
            # Small margin around 0..100 so lines at the extremes stay
            # visible.
            range_y=[-5, 105],
        )
        return figure.update_layout(xaxis_title='Time',
                                    yaxis_title='Percentage')
74
+
75
+
76
def create_monitor_demo() -> gr.Blocks:
    """Assemble the Gradio UI that shows live system usage.

    The returned Blocks holds three components, each refreshed with
    ``every=1``: a hidden JSON component whose value callable is
    ``SystemMonitor.update`` (used only to trigger sampling), a JSON view
    of the latest readings, and a plot of the sample history.
    """
    system_monitor = SystemMonitor()
    with gr.Blocks() as blocks:
        # Invisible component: its periodic refresh is what drives sampling.
        gr.JSON(value=system_monitor.update, visible=False, every=1)
        # Visible components read back what the sampler recorded.
        gr.JSON(value=system_monitor.get_json, every=1, show_label=False)
        gr.Plot(value=system_monitor.get_graph, every=1, show_label=False)
    return blocks
83
+
84
+
85
if __name__ == '__main__':
    # Standalone entry point: serve only the monitor UI, with the queue
    # enabled and its API closed.
    demo = create_monitor_demo()
    demo.queue(api_open=False)
    demo.launch()
app_training.py CHANGED
@@ -6,6 +6,7 @@ import os
6
 
7
  import gradio as gr
8
 
 
9
  from constants import UploadTarget
10
  from inference import InferencePipeline
11
  from trainer import Trainer
@@ -13,6 +14,11 @@ from trainer import Trainer
13
 
14
  def create_training_demo(trainer: Trainer,
15
  pipe: InferencePipeline | None = None) -> gr.Blocks:
 
 
 
 
 
16
  hf_token = os.getenv('HF_TOKEN')
17
  with gr.Blocks() as demo:
18
  with gr.Row():
@@ -108,8 +114,14 @@ def create_training_demo(trainer: Trainer,
108
  run_button = gr.Button('Start Training')
109
 
110
  with gr.Box():
111
- gr.Markdown('Output message')
112
- output_message = gr.Markdown()
 
 
 
 
 
 
113
 
114
  if pipe is not None:
115
  run_button.click(fn=pipe.clear)
@@ -136,8 +148,7 @@ def create_training_demo(trainer: Trainer,
136
  upload_to,
137
  remove_gpu_after_training,
138
  input_token,
139
- ],
140
- outputs=output_message)
141
  return demo
142
 
143
 
6
 
7
  import gradio as gr
8
 
9
+ from app_system_monitor import create_monitor_demo
10
  from constants import UploadTarget
11
  from inference import InferencePipeline
12
  from trainer import Trainer
14
 
15
  def create_training_demo(trainer: Trainer,
16
  pipe: InferencePipeline | None = None) -> gr.Blocks:
17
+ def read_log() -> str:
18
+ with open(trainer.log_file) as f:
19
+ lines = f.readlines()
20
+ return ''.join(lines[-10:])
21
+
22
  hf_token = os.getenv('HF_TOKEN')
23
  with gr.Blocks() as demo:
24
  with gr.Row():
114
  run_button = gr.Button('Start Training')
115
 
116
  with gr.Box():
117
+ gr.Text(label='Log',
118
+ value=read_log,
119
+ lines=10,
120
+ max_lines=10,
121
+ every=1)
122
+ if not os.getenv('DISABLE_SYSTEM_MONITOR'):
123
+ with gr.Accordion(label='System info', open=False):
124
+ create_monitor_demo()
125
 
126
  if pipe is not None:
127
  run_button.click(fn=pipe.clear)
148
  upload_to,
149
  remove_gpu_after_training,
150
  input_token,
151
+ ])
 
152
  return demo
153
 
154
 
requirements-monitor.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
1
+ nvitop==1.1.1
2
+ pandas==2.0.0
3
+ plotly==5.14.1
4
+ psutil==5.9.4
trainer.py CHANGED
@@ -32,6 +32,9 @@ class Trainer:
32
  self.checkpoint_dir = pathlib.Path('checkpoints')
33
  self.checkpoint_dir.mkdir(exist_ok=True)
34
 
 
 
 
35
  def download_base_model(self, base_model_id: str) -> str:
36
  model_dir = self.checkpoint_dir / base_model_id
37
  if not model_dir.exists():
@@ -72,7 +75,7 @@ class Trainer:
72
  upload_to: str,
73
  remove_gpu_after_training: bool,
74
  input_token: str,
75
- ) -> str:
76
  if SPACE_ID == ORIGINAL_SPACE_ID:
77
  raise gr.Error(
78
  'This Space does not work on this Shared UI. Duplicate the Space and attribute a GPU'
@@ -134,15 +137,19 @@ class Trainer:
134
  OmegaConf.save(config, f)
135
 
136
  command = f'accelerate launch Tune-A-Video/train_tuneavideo.py --config {config_path}'
137
- subprocess.run(shlex.split(command))
 
 
 
 
138
  save_model_card(save_dir=output_dir,
139
  base_model=base_model,
140
  training_prompt=training_prompt,
141
  test_prompt=validation_prompt,
142
  test_image_dir='samples')
143
 
144
- message = 'Training completed!'
145
- print(message)
146
 
147
  if upload_to_hub:
148
  upload_message = self.model_uploader.upload_model(
@@ -152,8 +159,8 @@ class Trainer:
152
  private=use_private_repo,
153
  delete_existing_repo=delete_existing_repo,
154
  input_token=input_token)
155
- print(upload_message)
156
- message = message + '\n' + upload_message
157
 
158
  if remove_gpu_after_training:
159
  space_id = os.getenv('SPACE_ID')
@@ -162,5 +169,3 @@ class Trainer:
162
  token=self.hf_token if self.hf_token else input_token)
163
  api.request_space_hardware(repo_id=space_id,
164
  hardware='cpu-basic')
165
-
166
- return message
32
  self.checkpoint_dir = pathlib.Path('checkpoints')
33
  self.checkpoint_dir.mkdir(exist_ok=True)
34
 
35
+ self.log_file = pathlib.Path('log.txt')
36
+ self.log_file.touch(exist_ok=True)
37
+
38
  def download_base_model(self, base_model_id: str) -> str:
39
  model_dir = self.checkpoint_dir / base_model_id
40
  if not model_dir.exists():
75
  upload_to: str,
76
  remove_gpu_after_training: bool,
77
  input_token: str,
78
+ ) -> None:
79
  if SPACE_ID == ORIGINAL_SPACE_ID:
80
  raise gr.Error(
81
  'This Space does not work on this Shared UI. Duplicate the Space and attribute a GPU'
137
  OmegaConf.save(config, f)
138
 
139
  command = f'accelerate launch Tune-A-Video/train_tuneavideo.py --config {config_path}'
140
+ with open(self.log_file, 'w') as f:
141
+ subprocess.run(shlex.split(command),
142
+ stdout=f,
143
+ stderr=subprocess.STDOUT,
144
+ text=True)
145
  save_model_card(save_dir=output_dir,
146
  base_model=base_model,
147
  training_prompt=training_prompt,
148
  test_prompt=validation_prompt,
149
  test_image_dir='samples')
150
 
151
+ with open(self.log_file, 'a') as f:
152
+ f.write('Training completed!\n')
153
 
154
  if upload_to_hub:
155
  upload_message = self.model_uploader.upload_model(
159
  private=use_private_repo,
160
  delete_existing_repo=delete_existing_repo,
161
  input_token=input_token)
162
+ with open(self.log_file, 'a') as f:
163
+ f.write(upload_message)
164
 
165
  if remove_gpu_after_training:
166
  space_id = os.getenv('SPACE_ID')
169
  token=self.hf_token if self.hf_token else input_token)
170
  api.request_space_hardware(repo_id=space_id,
171
  hardware='cpu-basic')