camenduru
/

thrust

Model card Files Files and versions Community

thrust / ci /common /memmon.py

camenduru

thanks to nvidia ❤

8ae5fc5 about 2 years ago

raw

history blame contribute delete

3.76 kB

	#! /usr/bin/env python

	# Copyright (c) 2022 NVIDIA Corporation
	# Reply-To: Allison Vacanti <alliepiper16@gmail.com>
	# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	# Released under the Apache License v2.0 with LLVM Exceptions.
	# See https://llvm.org/LICENSE.txt for license information.

	# Usage text handed to argparse (`usage=help_text`); argparse expands `%(prog)s`.
	# Keep any other literal percent sign out of this string, or escape it as `%%`.
	help_text = """%(prog)s [--log-threshold GiB] [--fail-threshold GiB] [--log-file PATH]

	This script:

	1. Runs `top -bco RES`, continuously extracting the memory usage of each process.
	2. If a process uses more than `log_threshold` GiB and exceeds any other recorded
	entry for the process, it is stored in `entries`.
	3. When this script receives SIGINT, it writes `log_file`, which contains all
	recorded max-memory-per-process entries. Each entry is marked PASS or FAIL
	depending on whether it reached `fail_threshold`.
	"""

	import argparse
	import os
	import re
	import signal
	import sys

	from subprocess import Popen, PIPE, STDOUT

	# Command-line interface. Both thresholds are expressed in GiB.
	parser = argparse.ArgumentParser(prog='memmon.py', usage=help_text)
	parser.add_argument(
	    '--log-threshold',
	    dest='log_threshold',
	    type=float,
	    default=0.5,
	    help='Logging threshold in GiB.',
	)
	parser.add_argument(
	    '--fail-threshold',
	    dest='fail_threshold',
	    type=float,
	    default=2,
	    help='Failure threshold in GiB.',
	)
	parser.add_argument(
	    '--log-file',
	    dest='log_file',
	    type=str,
	    default='memmon_log',
	    help='Output file for log entries.',
	)
	# parse_known_args() tolerates unrecognized arguments; they land in `unused`.
	args, unused = parser.parse_known_args()

	# Largest memory usage (GiB) recorded so far, keyed by command string.
	entries = {}


	def signal_handler(sig, frame):
	    """Write the recorded peak-memory entries to the log file, then exit.

	    Installed as the SIGINT handler. Every item of the module-level `entries`
	    dict is written to `args.log_file` as one line, largest usage first, and
	    is marked FAIL when its usage reaches `args.fail_threshold` (PASS
	    otherwise).

	    Args:
	        sig: Signal number supplied by the `signal` module (unused).
	        frame: Interrupted stack frame supplied by the `signal` module (unused).

	    Raises:
	        SystemExit: Always, with exit code 0, after the log has been written.
	    """
	    # Sort by mem, largest first:
	    sortentries = sorted(entries.items(), key=lambda x: x[1], reverse=True)

	    # The context manager closes the log file even if a write raises.
	    with open(args.log_file, "w") as lf:
	        for com, mem in sortentries:
	            status = "FAIL" if mem >= args.fail_threshold else "PASS"
	            lf.write("%4s | %3.1f GiB | %s\n" % (status, mem, com))

	    sys.exit(0)


	# Dump the collected entries and exit when the script is interrupted.
	signal.signal(signal.SIGINT, signal_handler)

	# Find the toprc config file and configure top's env.
	# This config:
	# - Hides all columns except for RES and COMMAND
	# - Sorts by RES
	# - Enables long command strings (-c)
	script_dir = os.path.dirname(os.path.realpath(__file__))
	config_dir = os.path.join(script_dir, 'memmon_config')

	# Run top in batch mode (-b) with a 512-column width override (-w 512).
	# NOTE: `env` replaces the child's whole environment, so top only sees
	# XDG_CONFIG_HOME, which points it at the bundled config directory.
	proc = Popen(["top", "-b", "-w", "512"],
	             stdin=PIPE, stdout=PIPE, stderr=STDOUT,
	             env={"XDG_CONFIG_HOME": config_dir})

	# One process row of top's output: optional padding, the memory value with an
	# optional unit suffix (group 1), whitespace, then the command (group 2).
	# `\s*` (rather than a single `\s`) is required because the memory column is
	# padded with a varying number of spaces and the final line may lack a newline.
	regex = re.compile(r"^\s*([0-9.]+[kmgtp]?)\s+(.+)\s*$")


	# Power-of-two exponent that scales a value carrying the given unit suffix
	# into GiB (e.g. "m" values are divided by 1024, i.e. multiplied by 2**-10).
	_GIB_EXPONENT_BY_SUFFIX = {"k": -20, "m": -10, "g": 0, "t": 10, "p": 20}


	def parse_mem(mem_str):
	    """Convert a memory string from top into floating point GiB.

	    A trailing "k", "m", "g", "t" or "p" selects the unit; a string without
	    one of these suffixes is interpreted as a plain byte count.

	    Args:
	        mem_str: Non-empty memory value, e.g. "512m", "2.5g" or "1073741824".

	    Returns:
	        The amount of memory in GiB as a float.

	    Raises:
	        ValueError: If the numeric part cannot be parsed by `float`.
	        IndexError: If `mem_str` is empty.
	    """
	    exponent = _GIB_EXPONENT_BY_SUFFIX.get(mem_str[-1])
	    if exponent is None:
	        # No recognized suffix: the whole string is a number of bytes.
	        return float(mem_str) * 2.0 ** -30
	    # Scaling by an exact power of two gives the same result as the
	    # equivalent chain of multiplications/divisions by 1024.
	    return float(mem_str[:-1]) * 2.0 ** exponent


	# Consume top's output line by line until the stream ends (or until the SIGINT
	# handler exits the script), tracking the peak memory usage of each command.
	for line in proc.stdout:
	    # Command lines can contain arbitrary bytes; substitute undecodable ones
	    # instead of letting a UnicodeDecodeError abort the monitor.
	    line = line.decode("utf-8", "replace")
	    match = regex.match(line)
	    if not match:
	        continue
	    mem = parse_mem(match.group(1))
	    # Ignore anything below both thresholds.
	    if mem < args.log_threshold and mem < args.fail_threshold:
	        continue
	    com = match.group(2)
	    # Only a new maximum is recorded. Skipping equal values as well keeps an
	    # unchanged reading from re-printing the same failure on every refresh.
	    if com in entries and entries[com] >= mem:
	        continue
	    if mem >= args.fail_threshold:
	        # Print a notice immediately -- this helps identify the failures
	        # as they happen, since `com` may not provide enough info.
	        print("memmon.py failure: Build step exceed memory threshold:\n"
	              " - Threshold: %3.1f GiB\n"
	              " - Usage: %3.1f GiB\n"
	              " - Command: %s" % (args.fail_threshold, mem, com))
	        # Flush so the notice is not held back by buffering when stdout is
	        # redirected to a pipe or file.
	        sys.stdout.flush()
	    entries[com] = mem