meg-huggingface commited on
Commit
939c209
1 Parent(s): b891a6a

Adding Traceback handling

Browse files
Files changed (4) hide show
  1. Dockerfile +1 -2
  2. create_results.py +0 -39
  3. entrypoint.sh +18 -9
  4. process_runs.py +111 -0
Dockerfile CHANGED
@@ -21,7 +21,6 @@ RUN mkdir -p .cache
21
  RUN chmod 777 -R .cache
22
  #RUN chmod 777 -R data
23
 
24
-
25
  RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
26
  build-essential \
27
  ca-certificates \
@@ -61,7 +60,7 @@ COPY ./.cache /.cache
61
  COPY ./entrypoint.sh /entrypoint.sh
62
  COPY ./pause_space.py /pause_space.py
63
  COPY ./parse_requests.py /parse_requests.py
64
- COPY ./create_results.py /create_results.py
65
  COPY ./runs /runs
66
  COPY ./attempts.txt /attempts.txt
67
  COPY ./failed_attempts.txt /failed_attempts.txt
 
21
  RUN chmod 777 -R .cache
22
  #RUN chmod 777 -R data
23
 
 
24
  RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
25
  build-essential \
26
  ca-certificates \
 
60
  COPY ./entrypoint.sh /entrypoint.sh
61
  COPY ./pause_space.py /pause_space.py
62
  COPY ./parse_requests.py /parse_requests.py
63
+ COPY ./process_runs.py /process_runs.py
64
  COPY ./runs /runs
65
  COPY ./attempts.txt /attempts.txt
66
  COPY ./failed_attempts.txt /failed_attempts.txt
create_results.py DELETED
@@ -1,39 +0,0 @@
1
- import os
2
- import sys
3
- from datasets import load_dataset, Dataset
4
- from huggingface_hub import HfApi
5
- import pandas as pd
6
-
7
- TOKEN = os.environ.get("DEBUG")
8
-
9
- api = HfApi(token=TOKEN)
10
-
11
- out_dir = sys.argv[1]
12
- all_attempts_read = open("attempts.txt", "r+").readlines()
13
- failed_attempts_read = open("failed_attempts.txt", "r+").readlines()
14
-
15
- # Uploading output to the results dataset.
16
- api.upload_folder(
17
- folder_path=out_dir,
18
- repo_id="AIEnergyScore/results_debug",
19
- repo_type="dataset",
20
- )
21
-
22
- # Updating requests
23
- requests = load_dataset("AIEnergyScore/requests_debug", split="test",
24
- token=TOKEN)
25
- requests_dset = requests.to_pandas()
26
-
27
- for line in all_attempts_read:
28
- experiment_name, model = line.strip().split(',')
29
- if line not in failed_attempts_read:
30
- requests_dset.loc[
31
- requests_dset["model"] == model, ['status']] = "COMPLETED"
32
- else:
33
- requests_dset.loc[
34
- requests_dset["model"] == model, ['status']] = "FAILED"
35
-
36
- updated_dset = Dataset.from_pandas(requests_dset)
37
- updated_dset.push_to_hub("AIEnergyScore/requests_debug", split="test",
38
- token=TOKEN)
39
- print("Updated model status")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
entrypoint.sh CHANGED
@@ -1,27 +1,36 @@
1
  #!/bin/bash
2
 
 
3
  export SPACE="AIEnergyScore/launch-computation-example"
4
 
5
- echo "Attempting to run."
 
 
6
 
 
7
  # For each line in the requests dataset....
8
  python /parse_requests.py | while read -r line; do
9
- # Read the name of the model and the experiment.
10
- IFS="," read backend_model experiment_name <<< "${line}"
11
- echo "Benchmarking Model: ${backend_model}, Task: ${experiment_name}"
12
 
13
  # Initialize the directory for output.
14
  now=$(date +%Y-%m-%d-%H-%M-%S)
15
- run_dir="./runs/${experiment_name}/${backend_model}/${now}"
16
  mkdir -p "$run_dir"
17
- echo "${experiment_name},${backend_model}" >> /attempts.txt
 
18
 
19
- # Let the benchmarking begin!
20
- optimum-benchmark --config-name "${experiment_name}" --config-dir /optimum-benchmark/examples/energy_star/ backend.model="${backend_model}" backend.processor="${backend_model}" hydra.run.dir="${run_dir}" 2> "${run_dir}/error.log" || echo "${experiment_name},${backend_model}" >> /failed_attempts.txt
 
 
 
 
21
  done
22
 
23
  echo "Finished; updating requests dataset and results dataset."
24
- python /create_results.py ./runs
25
 
26
  # Pausing space
27
  echo "Pausing space."
 
1
  #!/bin/bash
2
 
3
+ # TODO: Why is this here? Can we delete it?
4
  export SPACE="AIEnergyScore/launch-computation-example"
5
 
6
+ # Can use this for errors too: trap 'echo "An error occurred."' ERR
7
+
8
+ config_dir="/optimum-benchmark/examples/energy_star/"
9
 
10
+ echo "Attempting to run."
11
  # For each line in the requests dataset....
12
  python /parse_requests.py | while read -r line; do
13
+ # Read the name of the model and the experiment (task).
14
+ IFS="," read model task <<< "${line}"
15
+ echo "Benchmarking Model: ${model}, Task: ${task}"
16
 
17
  # Initialize the directory for output.
18
  now=$(date +%Y-%m-%d-%H-%M-%S)
19
+ run_dir="/runs/${task}/${model}/${now}"
20
  mkdir -p "$run_dir"
21
+ # Save the task/model run directory to text file, for tracking purposes.
22
+ echo "${run_dir}" >> /attempts.txt
23
 
24
+ { # try
25
+ # Let the benchmarking begin!
26
+ optimum-benchmark --config-name "${task}" --config-dir="${config_dir}" backend.model="${model}" backend.processor="${model}" hydra.run.dir="${run_dir}" 2> "${run_dir}/error.log"
27
+ } || { # catch
28
+ echo "${run_dir}" >> /failed_attempts.txt
29
+ }
30
  done
31
 
32
  echo "Finished; updating requests dataset and results dataset."
33
+ python /process_runs.py
34
 
35
  # Pausing space
36
  echo "Pausing space."
process_runs.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from datasets import load_dataset, Dataset
4
+ from huggingface_hub import HfApi
5
+
6
+ TOKEN = os.environ.get("DEBUG")
7
+ api = HfApi(token=TOKEN)
8
+
9
+ REQUESTS_DSET = "AIEnergyScore/requests_debug"
10
+ RESULTS_DSET = "AIEnergyScore/results_debug"
11
+ PENDING = 'PENDING'
12
+ COMPLETED = 'COMPLETED'
13
+ FAILED = 'FAILED'
14
+
15
+ def parse_args():
16
+ parser = argparse.ArgumentParser()
17
+ parser.add_argument(
18
+ "--run_dir",
19
+ default="/runs",
20
+ type=str,
21
+ required=False,
22
+ help="Path to the run directory.",
23
+ )
24
+ parser.add_argument(
25
+ "--attempts",
26
+ default="/attempts.txt",
27
+ type=str,
28
+ required=False,
29
+ help="File with per-line run attempt directories. Assumes format '/runs/{task}/{model}/{timestamp}'",
30
+ )
31
+ parser.add_argument(
32
+ "--failed_attempts",
33
+ default="/failed_attempts.txt",
34
+ type=str,
35
+ required=False,
36
+ help="File with per-line failed run directories. Assumes format '/runs/{task}/{model}/{timestamp}'",
37
+ )
38
+ args = parser.parse_args()
39
+ return args
40
+
41
+ def check_for_traceback(run_dir):
42
+ # run_dir="./runs/${experiment_name}/${backend_model}/${now}"
43
+ found_error = False
44
+ error_message = ""
45
+ try:
46
+ # Read error message
47
+ with open(f"{run_dir}/error.log", 'r') as f:
48
+ # There may be a better way to do this that finds the
49
+ # index of Traceback, then prints from there : end-of-file index (the file length-1).
50
+ for line in f:
51
+ # Question: Do we even need to check for this? The presence of the
52
+ # error file, or at least a non-empty one,
53
+ # means there's been an error, no?
54
+ if 'Traceback (most recent call last):' in line:
55
+ found_error = True
56
+ if found_error:
57
+ error_message += line
58
+ except FileNotFoundError as e:
59
+ # When does this happen?
60
+ print(f"Could not find {run_dir}/error.log")
61
+ return error_message
62
+
63
+ def update_requests(requests, all_attempts, failed_attempts):
64
+ """
65
+ Sets All PENDING requests with the given model & task to 'COMPLETED' or 'FAILED.'
66
+ Reads in the all_attempts text file and failed_attempts text file, in which
67
+ each line is a run directory run_dir="/runs/${experiment_name}/${backend_model}/${now}"
68
+
69
+ :param requests: requests Dataset
70
+ :param all_attempts: text file of the run directories of each task/model/timestamp
71
+ :param failed_attempts: text file of the run directories of each task/model/timestamp
72
+ :return:
73
+ """
74
+ requests_df = requests.to_pandas()
75
+ # Each line is a run directory, where
76
+ # run_dir="/runs/${experiment_name}/${backend_model}/${now}"
77
+ for line in all_attempts:
78
+ split_run_dir = line.strip().split("/")
79
+ task = split_run_dir[1]
80
+ model = split_run_dir[2]
81
+ if line not in failed_attempts:
82
+ traceback_error = check_for_traceback(line)
83
+ if traceback_error != "":
84
+ print("Found a traceback error!")
85
+ print(traceback_error)
86
+ requests_df.loc[(requests_df["status"] == PENDING) & (requests_df["model"] == model) & (requests_df["task"] == task), ['status']] = FAILED
87
+ requests_df.loc[(requests_df["status"] == PENDING) & (requests_df["model"] == model) & (requests_df["task"] == task), ['error_message']] = traceback_error
88
+ else:
89
+ requests_df.loc[(requests_df["status"] == PENDING) & (requests_df["model"] == model) & (requests_df["task"] == task), ['status']] = COMPLETED
90
+ updated_dset = Dataset.from_pandas(requests_df)
91
+ return updated_dset
92
+
93
+ if __name__ == '__main__':
94
+ args = parse_args()
95
+ # Uploads all run output to the results dataset.
96
+ print(f"Uploading {args.run_dir} to {RESULTS_DSET}")
97
+ api.upload_folder(
98
+ folder_path=args.run_dir,
99
+ repo_id=f"{RESULTS_DSET}",
100
+ repo_type="dataset",
101
+ )
102
+ # Update requests dataset based on whether things have failed or not.
103
+ print(f"Examining the run directory for each model & task to determine if it {FAILED} or {COMPLETED}.")
104
+ print(f"Setting the corresponding line in {REQUESTS_DSET} to {FAILED} or {COMPLETED} based on what's in the directory.")
105
+ requests = load_dataset(f"{REQUESTS_DSET}", split="test", token=TOKEN)
106
+ all_attempts = open(f"{args.attempts}", "r+").readlines()
107
+ failed_attempts = open(f"{args.failed_attempts}", "r+").readlines()
108
+ updated_requests = update_requests(requests, all_attempts, failed_attempts)
109
+ print(f"Uploading updated {REQUESTS_DSET}.")
110
+ updated_requests.push_to_hub(f"{REQUESTS_DSET}", split="test", token=TOKEN)
111
+ print("Done.")