meg-huggingface committed · Commit 939c209 · Parent(s): b891a6a

Adding Traceback handling

Files changed:
- Dockerfile +1 -2
- create_results.py +0 -39
- entrypoint.sh +18 -9
- process_runs.py +111 -0
Dockerfile CHANGED

@@ -21,7 +21,6 @@ RUN mkdir -p .cache
 RUN chmod 777 -R .cache
 #RUN chmod 777 -R data
 
-
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     build-essential \
     ca-certificates \
@@ -61,7 +60,7 @@ COPY ./.cache /.cache
 COPY ./entrypoint.sh /entrypoint.sh
 COPY ./pause_space.py /pause_space.py
 COPY ./parse_requests.py /parse_requests.py
-COPY ./create_results.py /
+COPY ./create_results.py /process_runs.py
 COPY ./runs /runs
 COPY ./attempts.txt /attempts.txt
 COPY ./failed_attempts.txt /failed_attempts.txt
create_results.py DELETED

@@ -1,39 +0,0 @@
-import os
-import sys
-from datasets import load_dataset, Dataset
-from huggingface_hub import HfApi
-import pandas as pd
-
-TOKEN = os.environ.get("DEBUG")
-
-api = HfApi(token=TOKEN)
-
-out_dir = sys.argv[1]
-all_attempts_read = open("attempts.txt", "r+").readlines()
-failed_attempts_read = open("failed_attempts.txt", "r+").readlines()
-
-# Uploading output to the results dataset.
-api.upload_folder(
-    folder_path=out_dir,
-    repo_id="AIEnergyScore/results_debug",
-    repo_type="dataset",
-)
-
-# Updating requests
-requests = load_dataset("AIEnergyScore/requests_debug", split="test",
-                        token=TOKEN)
-requests_dset = requests.to_pandas()
-
-for line in all_attempts_read:
-    experiment_name, model = line.strip().split(',')
-    if line not in failed_attempts_read:
-        requests_dset.loc[
-            requests_dset["model"] == model, ['status']] = "COMPLETED"
-    else:
-        requests_dset.loc[
-            requests_dset["model"] == model, ['status']] = "FAILED"
-
-updated_dset = Dataset.from_pandas(requests_dset)
-updated_dset.push_to_hub("AIEnergyScore/requests_debug", split="test",
-                         token=TOKEN)
-print("Updated model status")
entrypoint.sh CHANGED

@@ -1,27 +1,36 @@
 #!/bin/bash
 
+# TODO: Why is this here? Can we delete it?
 export SPACE="AIEnergyScore/launch-computation-example"
 
-echo "
+# Can use this for errors too: trap 'echo "An error occurred."' ERR
+
+config_dir="/optimum-benchmark/examples/energy_star/"
 
+echo "Attempting to run."
 # For each line in the requests dataset....
 python /parse_requests.py | while read -r line; do
-    # Read the name of the model and the experiment.
-    IFS="," read
-    echo "Benchmarking Model: ${
+    # Read the name of the model and the experiment (task).
+    IFS="," read model task <<< "${line}"
+    echo "Benchmarking Model: ${model}, Task: ${task}"
 
     # Initialize the directory for output.
     now=$(date +%Y-%m-%d-%H-%M-%S)
-    run_dir="
+    run_dir="/runs/${task}/${model}/${now}"
     mkdir -p "$run_dir"
-
+    # Save the task/model run directory to text file, for tracking purposes.
+    echo "${run_dir}" >> /attempts.txt
 
-    #
-
+    { # try
+        # Let the benchmarking begin!
+        optimum-benchmark --config-name "${task}" --config-dir="${config_dir}" backend.model="${model}" backend.processor="${model}" hydra.run.dir="${run_dir}" 2> "${run_dir}/error.log"
+    } || { # catch
+        echo "${run_dir}" >> /failed_attempts.txt
+    }
 done
 
 echo "Finished; updating requests dataset and results dataset."
-python /
+python /process_runs.py
 
 # Pausing space
 echo "Pausing space."
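A small sketch (not from the commit) of what one iteration of the updated loop consumes and produces, assuming parse_requests.py emits "model,task" lines as implied by IFS="," read model task <<< "${line}"; the model and task names below are hypothetical.

from datetime import datetime

# Hypothetical line from parse_requests.py, in the "model,task" form the loop expects.
line = "openai-community/gpt2,text_generation"
model, task = line.split(",")

# Mirrors run_dir="/runs/${task}/${model}/${now}" in the new entrypoint.
now = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
run_dir = f"/runs/{task}/{model}/{now}"
print(run_dir)  # e.g. /runs/text_generation/openai-community/gpt2/2025-01-01-12-00-00

# entrypoint.sh appends this path to /attempts.txt before benchmarking, and also to
# /failed_attempts.txt if the optimum-benchmark command exits non-zero.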
process_runs.py ADDED

@@ -0,0 +1,111 @@
+import argparse
+import os
+from datasets import load_dataset, Dataset
+from huggingface_hub import HfApi
+
+TOKEN = os.environ.get("DEBUG")
+api = HfApi(token=TOKEN)
+
+REQUESTS_DSET = "AIEnergyScore/requests_debug"
+RESULTS_DSET = "AIEnergyScore/results_debug"
+PENDING = 'PENDING'
+COMPLETED = 'COMPLETED'
+FAILED = 'FAILED'
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--run_dir",
+        default="/runs",
+        type=str,
+        required=False,
+        help="Path to the run directory.",
+    )
+    parser.add_argument(
+        "--attempts",
+        default="/attempts.txt",
+        type=str,
+        required=False,
+        help="File with per-line run attempt directories. Assumes format '/runs/{task}/{model}/{timestamp}'",
+    )
+    parser.add_argument(
+        "--failed_attempts",
+        default="/failed_attempts.txt",
+        type=str,
+        required=False,
+        help="File with per-line failed run directories. Assumes format '/runs/{task}/{model}/{timestamp}'",
+    )
+    args = parser.parse_args()
+    return args
+
+def check_for_traceback(run_dir):
+    # run_dir="./runs/${experiment_name}/${backend_model}/${now}"
+    found_error = False
+    error_message = ""
+    try:
+        # Read error message
+        with open(f"{run_dir}/error.log", 'r') as f:
+            # There may be a better way to do this that finds the
+            # index of Traceback, then prints from there : end-of-file index (the file length-1).
+            for line in f:
+                # Question: Do we even need to check for this? The presence of the
+                # error file, or at least a non-empty one,
+                # means there's been an error, no?
+                if 'Traceback (most recent call last):' in line:
+                    found_error = True
+                if found_error:
+                    error_message += line
+    except FileNotFoundError as e:
+        # When does this happen?
+        print(f"Could not find {run_dir}/error.log")
+    return error_message
+
+def update_requests(requests, all_attempts, failed_attempts):
+    """
+    Sets All PENDING requests with the given model & task to 'COMPLETED' or 'FAILED.'
+    Reads in the all_attempts text file and failed_attempts text file, in which
+    each line is a run directory run_dir="/runs/${experiment_name}/${backend_model}/${now}"
+
+    :param requests: requests Dataset
+    :param all_attempts: text file of the run directories of each task/model/timestamp
+    :param failed_attempts: text file of the run directories of each task/model/timestamp
+    :return:
+    """
+    requests_df = requests.to_pandas()
+    # Each line is a run directory, where
+    # run_dir="/runs/${experiment_name}/${backend_model}/${now}"
+    for line in all_attempts:
+        split_run_dir = line.strip().split("/")
+        task = split_run_dir[1]
+        model = split_run_dir[2]
+        if line not in failed_attempts:
+            traceback_error = check_for_traceback(line)
+            if traceback_error != "":
+                print("Found a traceback error!")
+                print(traceback_error)
+                requests_df.loc[(requests_df["status"] == PENDING) & (requests_df["model"] == model) & (requests_df["task"] == task), ['status']] = FAILED
+                requests_df.loc[(requests_df["status"] == PENDING) & (requests_df["model"] == model) & (requests_df["task"] == task), ['error_message']] = traceback_error
+            else:
+                requests_df.loc[(requests_df["status"] == PENDING) & (requests_df["model"] == model) & (requests_df["task"] == task), ['status']] = COMPLETED
+    updated_dset = Dataset.from_pandas(requests_df)
+    return updated_dset
+
+if __name__ == '__main__':
+    args = parse_args()
+    # Uploads all run output to the results dataset.
+    print(f"Uploading {args.run_dir} to {RESULTS_DSET}")
+    api.upload_folder(
+        folder_path=args.run_dir,
+        repo_id=f"{RESULTS_DSET}",
+        repo_type="dataset",
+    )
+    # Update requests dataset based on whether things have failed or not.
+    print(f"Examining the run directory for each model & task to determine if it {FAILED} or {COMPLETED}.")
+    print(f"Setting the corresponding line in {REQUESTS_DSET} to {FAILED} or {COMPLETED} based on what's in the directory.")
+    requests = load_dataset(f"{REQUESTS_DSET}", split="test", token=TOKEN)
+    all_attempts = open(f"{args.attempts}", "r+").readlines()
+    failed_attempts = open(f"{args.failed_attempts}", "r+").readlines()
+    updated_requests = update_requests(requests, all_attempts, failed_attempts)
+    print(f"Uploading updated {REQUESTS_DSET}.")
+    updated_requests.push_to_hub(f"{REQUESTS_DSET}", split="test", token=TOKEN)
+    print("Done.")
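A minimal, self-contained sketch (not part of the commit) of the traceback scan that check_for_traceback() applies to ${run_dir}/error.log: everything from the first "Traceback (most recent call last):" line onward is accumulated into the returned error message. The sample log contents below are made up.

# Hypothetical error.log contents.
sample_log = (
    "Some benchmark output\n"
    "Traceback (most recent call last):\n"
    '  File "run.py", line 1, in <module>\n'
    '    raise ValueError("boom")\n'
    "ValueError: boom\n"
)

# Same scan as check_for_traceback(): start collecting at the Traceback marker.
found_error = False
error_message = ""
for line in sample_log.splitlines(keepends=True):
    if "Traceback (most recent call last):" in line:
        found_error = True
    if found_error:
        error_message += line

print(error_message)  # the traceback block that process_runs.py would store in 'error_message'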