EdgeTA / utils /third_party /nni /tools /nnictl /tensorboard_utils.py
LINC-BIT's picture
Upload 1912 files
b84549f verified
raw
history blame
6.92 kB
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
import json
import re
import tempfile
from subprocess import call, Popen
from .rest_utils import rest_get, check_rest_server_quick, check_response
from .config_utils import Config, Experiments
from .url_utils import trial_jobs_url, get_local_urls
from .constants import REST_TIME_OUT
from .common_utils import print_normal, print_error, print_green, detect_process, detect_port, check_tensorboard_version
from .nnictl_utils import check_experiment_id, check_experiment_id
from .ssh_utils import create_ssh_sftp_client, copy_remote_directory_to_local
def parse_log_path(args, trial_content):
'''parse log path'''
path_list = []
host_list = []
for trial in trial_content:
if args.trial_id and args.trial_id != 'all' and trial.get('id') != args.trial_id:
continue
pattern = r'(?P<head>.+)://(?P<host>.+):(?P<path>.*)'
match = re.search(pattern, trial['logPath'])
if match:
path_list.append(match.group('path'))
host_list.append(match.group('host'))
if not path_list:
print_error('Trial id %s error!' % args.trial_id)
exit(1)
return path_list, host_list
def copy_data_from_remote(args, nni_config, trial_content, path_list, host_list, temp_nni_path):
'''use ssh client to copy data from remote machine to local machien'''
machine_list = nni_config.get_config('experimentConfig').get('machineList')
machine_dict = {}
local_path_list = []
for machine in machine_list:
machine_dict[machine['ip']] = {'port': machine['port'], 'passwd': machine['passwd'], 'username': machine['username'],
'sshKeyPath': machine.get('sshKeyPath'), 'passphrase': machine.get('passphrase')}
for index, host in enumerate(host_list):
local_path = os.path.join(temp_nni_path, trial_content[index].get('id'))
local_path_list.append(local_path)
print_normal('Copying log data from %s to %s' % (host + ':' + path_list[index], local_path))
sftp = create_ssh_sftp_client(host, machine_dict[host]['port'], machine_dict[host]['username'], machine_dict[host]['passwd'],
machine_dict[host]['sshKeyPath'], machine_dict[host]['passphrase'])
copy_remote_directory_to_local(sftp, path_list[index], local_path)
print_normal('Copy done!')
return local_path_list
def get_path_list(args, nni_config, trial_content, temp_nni_path):
'''get path list according to different platform'''
path_list, host_list = parse_log_path(args, trial_content)
platform = nni_config.get_config('experimentConfig').get('trainingServicePlatform')
if platform == 'local':
print_normal('Log path: %s' % ' '.join(path_list))
return path_list
elif platform == 'remote':
path_list = copy_data_from_remote(args, nni_config, trial_content, path_list, host_list, temp_nni_path)
print_normal('Log path: %s' % ' '.join(path_list))
return path_list
else:
print_error('Not supported platform!')
exit(1)
def format_tensorboard_log_path(path_list):
new_path_list = []
for index, value in enumerate(path_list):
new_path_list.append('name%d:%s' % (index + 1, value))
return ','.join(new_path_list)
def start_tensorboard_process(args, nni_config, path_list, temp_nni_path):
'''call cmds to start tensorboard process in local machine'''
if detect_port(args.port):
print_error('Port %s is used by another process, please reset port!' % str(args.port))
exit(1)
with open(os.path.join(temp_nni_path, 'tensorboard_stdout'), 'a+') as stdout_file, \
open(os.path.join(temp_nni_path, 'tensorboard_stderr'), 'a+') as stderr_file:
log_dir_cmd = '--logdir_spec' if check_tensorboard_version() >= '2.0' else '--logdir'
cmds = ['tensorboard', log_dir_cmd, format_tensorboard_log_path(path_list), '--port', str(args.port)]
tensorboard_process = Popen(cmds, stdout=stdout_file, stderr=stderr_file)
url_list = get_local_urls(args.port)
print_green('Start tensorboard success!')
print_normal('Tensorboard urls: ' + ' '.join(url_list))
tensorboard_process_pid_list = nni_config.get_config('tensorboardPidList')
if tensorboard_process_pid_list is None:
tensorboard_process_pid_list = [tensorboard_process.pid]
else:
tensorboard_process_pid_list.append(tensorboard_process.pid)
nni_config.set_config('tensorboardPidList', tensorboard_process_pid_list)
def stop_tensorboard(args):
'''stop tensorboard'''
experiment_id = check_experiment_id(args)
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
config_file_name = experiment_dict[experiment_id]['fileName']
nni_config = Config(config_file_name)
tensorboard_pid_list = nni_config.get_config('tensorboardPidList')
if tensorboard_pid_list:
for tensorboard_pid in tensorboard_pid_list:
try:
cmds = ['kill', '-9', str(tensorboard_pid)]
call(cmds)
except Exception as exception:
print_error(exception)
nni_config.set_config('tensorboardPidList', [])
print_normal('Stop tensorboard success!')
else:
print_error('No tensorboard configuration!')
def start_tensorboard(args):
'''start tensorboard'''
experiment_id = check_experiment_id(args)
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
config_file_name = experiment_dict[experiment_id]['fileName']
nni_config = Config(config_file_name)
rest_port = nni_config.get_config('restServerPort')
rest_pid = nni_config.get_config('restServerPid')
if not detect_process(rest_pid):
print_error('Experiment is not running...')
return
running, response = check_rest_server_quick(rest_port)
trial_content = None
if running:
response = rest_get(trial_jobs_url(rest_port), REST_TIME_OUT)
if response and check_response(response):
trial_content = json.loads(response.text)
else:
print_error('List trial failed...')
else:
print_error('Restful server is not running...')
if not trial_content:
print_error('No trial information!')
exit(1)
if len(trial_content) > 1 and not args.trial_id:
print_error('There are multiple trials, please set trial id!')
exit(1)
experiment_id = nni_config.get_config('experimentId')
temp_nni_path = os.path.join(tempfile.gettempdir(), 'nni', experiment_id)
os.makedirs(temp_nni_path, exist_ok=True)
path_list = get_path_list(args, nni_config, trial_content, temp_nni_path)
start_tensorboard_process(args, nni_config, path_list, temp_nni_path)