meg-huggingface committed
Commit 939c209 • 1 Parent(s): b891a6a

Adding Traceback handling

Files changed:
- Dockerfile +1 -2
- create_results.py +0 -39
- entrypoint.sh +18 -9
- process_runs.py +111 -0
Dockerfile CHANGED

@@ -21,7 +21,6 @@ RUN mkdir -p .cache
 RUN chmod 777 -R .cache
 #RUN chmod 777 -R data
 
-
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     build-essential \
    ca-certificates \
@@ -61,7 +60,7 @@ COPY ./.cache /.cache
 COPY ./entrypoint.sh /entrypoint.sh
 COPY ./pause_space.py /pause_space.py
 COPY ./parse_requests.py /parse_requests.py
-COPY ./create_results.py /
+COPY ./create_results.py /process_runs.py
 COPY ./runs /runs
 COPY ./attempts.txt /attempts.txt
 COPY ./failed_attempts.txt /failed_attempts.txt
create_results.py DELETED

@@ -1,39 +0,0 @@
import os
import sys
from datasets import load_dataset, Dataset
from huggingface_hub import HfApi
import pandas as pd

TOKEN = os.environ.get("DEBUG")

api = HfApi(token=TOKEN)

out_dir = sys.argv[1]
all_attempts_read = open("attempts.txt", "r+").readlines()
failed_attempts_read = open("failed_attempts.txt", "r+").readlines()

# Uploading output to the results dataset.
api.upload_folder(
    folder_path=out_dir,
    repo_id="AIEnergyScore/results_debug",
    repo_type="dataset",
)

# Updating requests
requests = load_dataset("AIEnergyScore/requests_debug", split="test",
                        token=TOKEN)
requests_dset = requests.to_pandas()

for line in all_attempts_read:
    experiment_name, model = line.strip().split(',')
    if line not in failed_attempts_read:
        requests_dset.loc[
            requests_dset["model"] == model, ['status']] = "COMPLETED"
    else:
        requests_dset.loc[
            requests_dset["model"] == model, ['status']] = "FAILED"

updated_dset = Dataset.from_pandas(requests_dset)
updated_dset.push_to_hub("AIEnergyScore/requests_debug", split="test",
                         token=TOKEN)
print("Updated model status")
entrypoint.sh CHANGED

@@ -1,27 +1,36 @@
 #!/bin/bash
 
+# TODO: Why is this here? Can we delete it?
 export SPACE="AIEnergyScore/launch-computation-example"
 
-echo "…
+# Can use this for errors too: trap 'echo "An error occurred."' ERR
+
+config_dir="/optimum-benchmark/examples/energy_star/"
 
+echo "Attempting to run."
 # For each line in the requests dataset....
 python /parse_requests.py | while read -r line; do
-    # Read the name of the model and the experiment.
-    IFS="," read …
-    echo "Benchmarking Model: ${…
+    # Read the name of the model and the experiment (task).
+    IFS="," read model task <<< "${line}"
+    echo "Benchmarking Model: ${model}, Task: ${task}"
 
     # Initialize the directory for output.
     now=$(date +%Y-%m-%d-%H-%M-%S)
-    run_dir="…
+    run_dir="/runs/${task}/${model}/${now}"
     mkdir -p "$run_dir"
-
+    # Save the task/model run directory to text file, for tracking purposes.
+    echo "${run_dir}" >> /attempts.txt
 
-    #…
-…
+    { # try
+        # Let the benchmarking begin!
+        optimum-benchmark --config-name "${task}" --config-dir="${config_dir}" backend.model="${model}" backend.processor="${model}" hydra.run.dir="${run_dir}" 2> "${run_dir}/error.log"
+    } || { # catch
+        echo "${run_dir}" >> /failed_attempts.txt
+    }
 done
 
 echo "Finished; updating requests dataset and results dataset."
-python /…
+python /process_runs.py
 
 # Pausing space
 echo "Pausing space."
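The new loop body uses a bash try/catch: in a "{ ... } || { ... }" pair, the second group runs only when the first exits non-zero, so a failed benchmark appends its run directory to /failed_attempts.txt while its stderr is captured in "${run_dir}/error.log" for process_runs.py to scan later. A minimal standalone sketch of that pattern follows; the command and file names are hypothetical stand-ins, not part of this repo.

    #!/bin/bash
    # Sketch of the try/catch pattern above, with hypothetical names.
    logfile="error.log"   # stand-in for "${run_dir}/error.log"
    { # try
        # 'false' stands in for the benchmark command; it always exits non-zero.
        false 2> "${logfile}"
    } || { # catch
        # Runs only when the group above exits with a non-zero status.
        echo "run failed" >> failed.txt
    }
    # The construct as a whole exits 0 here because the catch branch succeeded,
    # so a loop built this way keeps going after a failed run.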
process_runs.py ADDED

@@ -0,0 +1,111 @@
import argparse
import os
from datasets import load_dataset, Dataset
from huggingface_hub import HfApi

TOKEN = os.environ.get("DEBUG")
api = HfApi(token=TOKEN)

REQUESTS_DSET = "AIEnergyScore/requests_debug"
RESULTS_DSET = "AIEnergyScore/results_debug"
PENDING = 'PENDING'
COMPLETED = 'COMPLETED'
FAILED = 'FAILED'

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--run_dir",
        default="/runs",
        type=str,
        required=False,
        help="Path to the run directory.",
    )
    parser.add_argument(
        "--attempts",
        default="/attempts.txt",
        type=str,
        required=False,
        help="File with per-line run attempt directories. Assumes format '/runs/{task}/{model}/{timestamp}'",
    )
    parser.add_argument(
        "--failed_attempts",
        default="/failed_attempts.txt",
        type=str,
        required=False,
        help="File with per-line failed run directories. Assumes format '/runs/{task}/{model}/{timestamp}'",
    )
    args = parser.parse_args()
    return args

def check_for_traceback(run_dir):
    # run_dir="./runs/${experiment_name}/${backend_model}/${now}"
    found_error = False
    error_message = ""
    try:
        # Read error message
        with open(f"{run_dir}/error.log", 'r') as f:
            # There may be a better way to do this that finds the
            # index of Traceback, then prints from there : end-of-file index (the file length-1).
            for line in f:
                # Question: Do we even need to check for this? The presence of the
                # error file, or at least a non-empty one,
                # means there's been an error, no?
                if 'Traceback (most recent call last):' in line:
                    found_error = True
                if found_error:
                    error_message += line
    except FileNotFoundError as e:
        # When does this happen?
        print(f"Could not find {run_dir}/error.log")
    return error_message

def update_requests(requests, all_attempts, failed_attempts):
    """
    Sets all PENDING requests with the given model & task to 'COMPLETED' or 'FAILED'.
    Reads in the all_attempts text file and failed_attempts text file, in which
    each line is a run directory run_dir="/runs/${experiment_name}/${backend_model}/${now}"

    :param requests: requests Dataset
    :param all_attempts: text file of the run directories of each task/model/timestamp
    :param failed_attempts: text file of the run directories of each task/model/timestamp
    :return:
    """
    requests_df = requests.to_pandas()
    # Each line is a run directory, where
    # run_dir="/runs/${experiment_name}/${backend_model}/${now}"
    for line in all_attempts:
        split_run_dir = line.strip().split("/")
        task = split_run_dir[1]
        model = split_run_dir[2]
        if line not in failed_attempts:
            traceback_error = check_for_traceback(line)
            if traceback_error != "":
                print("Found a traceback error!")
                print(traceback_error)
                requests_df.loc[(requests_df["status"] == PENDING) & (requests_df["model"] == model) & (requests_df["task"] == task), ['status']] = FAILED
                requests_df.loc[(requests_df["status"] == PENDING) & (requests_df["model"] == model) & (requests_df["task"] == task), ['error_message']] = traceback_error
            else:
                requests_df.loc[(requests_df["status"] == PENDING) & (requests_df["model"] == model) & (requests_df["task"] == task), ['status']] = COMPLETED
    updated_dset = Dataset.from_pandas(requests_df)
    return updated_dset

if __name__ == '__main__':
    args = parse_args()
    # Uploads all run output to the results dataset.
    print(f"Uploading {args.run_dir} to {RESULTS_DSET}")
    api.upload_folder(
        folder_path=args.run_dir,
        repo_id=f"{RESULTS_DSET}",
        repo_type="dataset",
    )
    # Update requests dataset based on whether things have failed or not.
    print(f"Examining the run directory for each model & task to determine if it {FAILED} or {COMPLETED}.")
    print(f"Setting the corresponding line in {REQUESTS_DSET} to {FAILED} or {COMPLETED} based on what's in the directory.")
    requests = load_dataset(f"{REQUESTS_DSET}", split="test", token=TOKEN)
    all_attempts = open(f"{args.attempts}", "r+").readlines()
    failed_attempts = open(f"{args.failed_attempts}", "r+").readlines()
    updated_requests = update_requests(requests, all_attempts, failed_attempts)
    print(f"Uploading updated {REQUESTS_DSET}.")
    updated_requests.push_to_hub(f"{REQUESTS_DSET}", split="test", token=TOKEN)
    print("Done.")
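All three arguments default to the container paths written by entrypoint.sh (/runs, /attempts.txt, /failed_attempts.txt), so the entrypoint can call the script with no flags. A hedged example of invoking it by hand with explicit local paths; the paths and token below are placeholders, and DEBUG must hold a Hugging Face token with write access to the requests and results datasets.

    # Hypothetical manual invocation; token and paths are placeholders.
    export DEBUG=hf_xxxxx
    python process_runs.py \
        --run_dir ./runs \
        --attempts ./attempts.txt \
        --failed_attempts ./failed_attempts.txt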