zackliqcom
/

qdc-test-script

Model card Files Files and versions

xet

Community

zackliqcom committed 15 days ago

Commit

39dbdce

verified ·

1 Parent(s): 215c26f

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

run_backend_ops.py +28 -2
run_bench_tests.py +69 -4
utils.py +226 -29

run_backend_ops.py CHANGED Viewed

@@ -15,28 +15,54 @@ import sys
 import pytest
-from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_shell_command, write_qdc_log
 @pytest.fixture(scope="session", autouse=True)
 def install():
     """Push llama_cpp_bundle to the device if needed."""
-    push_bundle_if_needed(f"{BIN_PATH}/test-backend-ops")
 @pytest.mark.parametrize("type_a", ["mxfp4", "fp16", "q4_0"])
 def test_backend_ops_htp0(type_a):
     cmd = f"{CMD_PREFIX} GGML_HEXAGON_HOSTBUF=0 GGML_HEXAGON_EXPERIMENTAL=1 {BIN_PATH}/test-backend-ops -b HTP0 -o MUL_MAT"
     if type_a == "q4_0":
         cmd += r' -p "^(?=.*type_a=q4_0)(?!.*type_b=f32,m=576,n=512,k=576).*$"'
     else:
         cmd += f" -p type_a={type_a}"
     result = run_shell_command(
         cmd,
         check=False,
     )
     write_qdc_log(f"backend_ops_{type_a}.log", result.stdout or "")
     assert result.returncode == 0, f"test-backend-ops type_a={type_a} failed (exit {result.returncode})"
 if __name__ == "__main__":

 import pytest
+from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_shell_command, verify_binary_exists, write_qdc_log
 @pytest.fixture(scope="session", autouse=True)
 def install():
     """Session-wide setup: make sure test-backend-ops is usable on the device.

     Pushes the bundle when the binary is missing, then raises RuntimeError if
     verify_binary_exists still reports the binary as missing or not executable.
     """
     ops_binary = "{}/test-backend-ops".format(BIN_PATH)
     push_bundle_if_needed(ops_binary)

     # Nothing more to do once the binary is confirmed present and executable.
     if verify_binary_exists(ops_binary):
         return
     raise RuntimeError(f"Required binary not found or not executable: {ops_binary}")
 @pytest.mark.parametrize("type_a", ["mxfp4", "fp16", "q4_0"])
 def test_backend_ops_htp0(type_a):
     """Run test-backend-ops MUL_MAT on HTP0 for one type_a and require exit code 0.

     The command output is always written to backend_ops_<type_a>.log; when the
     command fails, the last 50 lines of output are echoed as well.
     """
     print(f"[TEST] Running backend-ops test for type_a={type_a}")

     # Re-check the binary right before it is used.
     ops_binary = f"{BIN_PATH}/test-backend-ops"
     if not verify_binary_exists(ops_binary):
         pytest.fail(f"Binary disappeared between setup and test execution: {ops_binary}")

     # q4_0 is selected with a regex that excludes one shape; other types by name.
     if type_a == "q4_0":
         pattern_arg = r' -p "^(?=.*type_a=q4_0)(?!.*type_b=f32,m=576,n=512,k=576).*$"'
     else:
         pattern_arg = f" -p type_a={type_a}"
     command = (
         f"{CMD_PREFIX} GGML_HEXAGON_HOSTBUF=0 GGML_HEXAGON_EXPERIMENTAL=1 "
         f"{BIN_PATH}/test-backend-ops -b HTP0 -o MUL_MAT"
     ) + pattern_arg

     print(f"[TEST] Executing test-backend-ops with type_a={type_a}")
     outcome = run_shell_command(command, check=False)
     exit_code = outcome.returncode

     # The log is stored whether the run passed or failed.
     write_qdc_log(f"backend_ops_{type_a}.log", outcome.stdout or "")

     if exit_code != 0:
         print(f"[TEST FAILED] test-backend-ops type_a={type_a} failed with exit code {exit_code}")
         if outcome.stdout:
             tail = outcome.stdout.split("\n")[-50:]
             print("[TEST FAILED] Last 50 lines of output:")
             print("\n".join(tail))

     assert exit_code == 0, f"test-backend-ops type_a={type_a} failed (exit {exit_code})"
     print(f"[TEST PASSED] type_a={type_a}")
 if __name__ == "__main__":

run_bench_tests.py CHANGED Viewed

@@ -18,7 +18,7 @@ import sys
 import pytest
-from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_shell_command, write_qdc_log
 MODEL_PATH = "/tmp/model.gguf"
 PROMPT = "What is the capital of France?"
@@ -28,12 +28,41 @@ CLI_OPTS = "--batch-size 128 -n 128 -no-cnv --seed 42"
 @pytest.fixture(scope="session", autouse=True)
 def install():
     """Push llama_cpp_bundle to the device and download model if needed."""
-    push_bundle_if_needed(f"{BIN_PATH}/llama-cli")
-    # Skip model download if already present
     result = run_shell_command(f"ls {MODEL_PATH}", check=False)
     if result.returncode != 0:
-        run_shell_command(f'curl -L -J --output {MODEL_PATH} "<<MODEL_URL>>"')
 @pytest.mark.parametrize(
@@ -45,14 +74,32 @@ def install():
     ],
 )
 def test_llama_completion(device, extra_flags):
     result = run_shell_command(
         f'{CMD_PREFIX} {BIN_PATH}/llama-completion'
         f' -m {MODEL_PATH} --device {device} -ngl 99 -t 4 {CLI_OPTS} {extra_flags} -fa on'
         f' -p "{PROMPT}"',
         check=False,
     )
     write_qdc_log(f"llama_completion_{device}.log", result.stdout or "")
     assert result.returncode == 0, f"llama-completion {device} failed (exit {result.returncode})"
 _DEVICE_LOG_NAME = {"none": "cpu", "GPUOpenCL": "gpu", "HTP0": "htp"}
@@ -67,13 +114,31 @@ _DEVICE_LOG_NAME = {"none": "cpu", "GPUOpenCL": "gpu", "HTP0": "htp"}
     ],
 )
 def test_llama_bench(device):
     result = run_shell_command(
         f"{CMD_PREFIX} {BIN_PATH}/llama-bench"
         f" -m {MODEL_PATH} --device {device} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32",
         check=False,
     )
     write_qdc_log(f"llama_bench_{_DEVICE_LOG_NAME[device]}.log", result.stdout or "")
     assert result.returncode == 0, f"llama-bench {device} failed (exit {result.returncode})"
 if __name__ == "__main__":

 import pytest
+from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_shell_command, verify_binary_exists, write_qdc_log
 MODEL_PATH = "/tmp/model.gguf"
 PROMPT = "What is the capital of France?"
 @pytest.fixture(scope="session", autouse=True)
 def install():
     """Push llama_cpp_bundle to the device and download the model if needed.

     Verifies that llama-cli, llama-completion and llama-bench are present and
     executable, then makes sure the model file exists at MODEL_PATH,
     downloading it with curl when it is missing.

     Raises:
         RuntimeError: a required binary is unusable, the MODEL_URL placeholder
             was never substituted, or the model download failed.
     """
     # Check and verify required binaries.
     llama_cli = f"{BIN_PATH}/llama-cli"
     required_binaries = [
         llama_cli,
         f"{BIN_PATH}/llama-completion",
         f"{BIN_PATH}/llama-bench",
     ]
     push_bundle_if_needed(llama_cli)
     for binary in required_binaries:
         if not verify_binary_exists(binary):
             raise RuntimeError(f"Required binary not found or not executable: {binary}")

     # Skip the download when the model is already on the device.
     print(f"[DEBUG] Checking if model exists: {MODEL_PATH}")
     if run_shell_command(f"ls {MODEL_PATH}", check=False).returncode == 0:
         print(f"[DEBUG] Model already exists: {MODEL_PATH}")
         return

     model_url = "<<MODEL_URL>>"
     print(f"[DEBUG] Model not found, downloading from {model_url}")

     # Detect an unsubstituted placeholder by its delimiters instead of comparing
     # against a second copy of the token: a textual substitution rewrites every
     # copy in this file, which would make an equality check always true.
     if model_url.startswith("<<") and model_url.endswith(">>"):
         print("[ERROR] MODEL_URL placeholder not replaced!")
         print("[ERROR] This should be replaced by run_qdc_jobs.py during artifact creation")
         raise RuntimeError("MODEL_URL placeholder not replaced")

     # --fail makes curl exit non-zero on HTTP errors instead of saving the
     # server's error page as if it were the model.
     download = run_shell_command(
         f'curl --fail -L -J --output {MODEL_PATH} "{model_url}"',
         check=False,
     )
     if download.returncode != 0:
         # Remove any partial file so the next session does not mistake it for a model.
         run_shell_command(f"rm -f {MODEL_PATH}", check=False)
         raise RuntimeError(f"Model download failed: {MODEL_PATH}")

     # Verify download succeeded.
     verify_result = run_shell_command(f"test -f {MODEL_PATH}", check=False)
     if verify_result.returncode != 0:
         raise RuntimeError(f"Model download failed: {MODEL_PATH}")

     # Report the model file size for debugging.
     size_result = run_shell_command(f"ls -lh {MODEL_PATH}", check=False)
     if size_result.returncode == 0:
         print(f"[DEBUG] Downloaded model: {size_result.stdout.strip()}")
 @pytest.mark.parametrize(
     ],
 )
 def test_llama_completion(device, extra_flags):
     """Run llama-completion on the given device and require exit code 0.

     Fails early when the binary or the model file is missing. The command
     output is written to llama_completion_<device>.log in every case.
     """
     print(f"[TEST] Running llama-completion test for device={device}")

     # Preconditions: the binary must be usable and the model file present.
     completion_bin = f"{BIN_PATH}/llama-completion"
     if not verify_binary_exists(completion_bin):
         pytest.fail(f"Binary not found: {completion_bin}")
     if run_shell_command(f"test -f {MODEL_PATH}", check=False).returncode != 0:
         pytest.fail(f"Model file not found: {MODEL_PATH}")

     print(f"[TEST] Executing llama-completion on device={device}")
     command = (
         f"{CMD_PREFIX} {BIN_PATH}/llama-completion"
         f" -m {MODEL_PATH} --device {device} -ngl 99 -t 4 {CLI_OPTS} {extra_flags} -fa on"
         f' -p "{PROMPT}"'
     )
     outcome = run_shell_command(command, check=False)
     exit_code = outcome.returncode

     write_qdc_log(f"llama_completion_{device}.log", outcome.stdout or "")

     if exit_code != 0:
         print(f"[TEST FAILED] llama-completion device={device} failed with exit code {exit_code}")
     assert exit_code == 0, f"llama-completion {device} failed (exit {exit_code})"
     print(f"[TEST PASSED] device={device}")
 _DEVICE_LOG_NAME = {"none": "cpu", "GPUOpenCL": "gpu", "HTP0": "htp"}
     ],
 )
 def test_llama_bench(device):
     """Run llama-bench on the given device and require exit code 0.

     Fails early when the binary or the model file is missing. The command
     output is written to llama_bench_<name>.log, where <name> is looked up in
     _DEVICE_LOG_NAME.
     """
     print(f"[TEST] Running llama-bench test for device={device}")

     # Preconditions: the binary must be usable and the model file present.
     bench_bin = f"{BIN_PATH}/llama-bench"
     if not verify_binary_exists(bench_bin):
         pytest.fail(f"Binary not found: {bench_bin}")
     if run_shell_command(f"test -f {MODEL_PATH}", check=False).returncode != 0:
         pytest.fail(f"Model file not found: {MODEL_PATH}")

     print(f"[TEST] Executing llama-bench on device={device}")
     command = (
         f"{CMD_PREFIX} {BIN_PATH}/llama-bench"
         f" -m {MODEL_PATH} --device {device} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32"
     )
     outcome = run_shell_command(command, check=False)
     exit_code = outcome.returncode

     write_qdc_log(f"llama_bench_{_DEVICE_LOG_NAME[device]}.log", outcome.stdout or "")

     if exit_code != 0:
         print(f"[TEST FAILED] llama-bench device={device} failed with exit code {exit_code}")
     assert exit_code == 0, f"llama-bench {device} failed (exit {exit_code})"
     print(f"[TEST PASSED] device={device}")
 if __name__ == "__main__":

utils.py CHANGED Viewed

@@ -44,22 +44,103 @@ options.set_capability("deviceName", os.getenv("QDC_DEVICE_NAME", "QCS9075M"))
 # ---------------------------------------------------------------------------
 def run_shell_command(cmd: str, *, check: bool = True) -> subprocess.CompletedProcess:
     """Run a shell command on the Linux device.
     For QDC Linux devices, commands are executed through the QDC infrastructure
     which provides SSH access to the device. The QDC Appium driver handles the
     SSH tunneling transparently.
     """
-    raw = subprocess.run(
-        ["ssh", os.getenv("QDC_DEVICE_HOST", "localhost"), f"{cmd}; echo __RC__:$?"],
-        text=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        timeout=300,
-    )
     stdout = raw.stdout
     returncode = raw.returncode
     if stdout:
         lines = stdout.rstrip("\n").split("\n")
         if lines and lines[-1].startswith("__RC__:"):
@@ -67,39 +148,155 @@ def run_shell_command(cmd: str, *, check: bool = True) -> subprocess.CompletedPr
                 returncode = int(lines[-1][7:])
                 stdout = "\n".join(lines[:-1]) + "\n"
             except ValueError:
-                pass
     print(stdout)
     result = subprocess.CompletedProcess(raw.args, returncode, stdout=stdout)
     if check:
-        assert returncode == 0, f"Command failed (exit {returncode})"
     return result
 def write_qdc_log(filename: str, content: str) -> None:
     """Write content as a log file to QDC_LOGS_PATH on the device for QDC log collection."""
-    run_shell_command(f"mkdir -p {QDC_LOGS_PATH}", check=False)
-    with tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False) as f:
-        f.write(content)
-        tmp_path = f.name
     try:
-        subprocess.run(
-            ["scp", tmp_path, f"{os.getenv('QDC_DEVICE_HOST', 'localhost')}:{QDC_LOGS_PATH}/{filename}"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            timeout=60,
-        )
-    finally:
-        os.unlink(tmp_path)
 def push_bundle_if_needed(check_binary: str) -> None:
     """Push llama_cpp_bundle to the device if check_binary is not already present."""
     result = run_shell_command(f"ls {check_binary}", check=False)
-    if result.returncode != 0:
-        subprocess.run(
-            ["scp", "-r", "/qdc/appium/llama_cpp_bundle/", f"{os.getenv('QDC_DEVICE_HOST', 'localhost')}:/tmp/"],
-            text=True,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            timeout=120,
-        )

 # ---------------------------------------------------------------------------
def verify_binary_exists(binary_path: str) -> bool:
    """Check on the device that a binary is a regular file and executable.

    When the file exists but is not executable, a ``chmod +x`` is attempted
    before giving up.

    Args:
        binary_path: Full path to the binary to check.

    Returns:
        True when the binary exists and is (or was made) executable,
        False otherwise.
    """
    print(f"[DEBUG] Verifying binary: {binary_path}")

    # Existence check.
    present = run_shell_command(f"test -f {binary_path}", check=False).returncode == 0
    if not present:
        print(f"[ERROR] Binary does not exist: {binary_path}")
        print(f"[ERROR] Expected location: {binary_path}")
        print(f"[ERROR] Bundle should be at: {BUNDLE_PATH}")
        print("[ERROR] Check if binaries were pushed correctly")
        return False

    # Executable-bit check, with one repair attempt.
    executable = run_shell_command(f"test -x {binary_path}", check=False).returncode == 0
    if not executable:
        print(f"[WARNING] Binary exists but is not executable: {binary_path}")
        print("[DEBUG] Attempting to set executable permissions")
        if run_shell_command(f"chmod +x {binary_path}", check=False).returncode != 0:
            print(f"[ERROR] Failed to set executable permissions on {binary_path}")
            return False
        print("[DEBUG] Successfully set executable permissions")

    # File listing is informational only; a failure here is ignored.
    listing = run_shell_command(f"ls -lh {binary_path}", check=False)
    if listing.returncode == 0:
        print(f"[DEBUG] Binary info: {listing.stdout.strip()}")

    print(f"[DEBUG] Binary verified: {binary_path}")
    return True
 def run_shell_command(cmd: str, *, check: bool = True) -> subprocess.CompletedProcess:
     """Run a shell command on the Linux device and report its real exit code.

     The command is executed with ``/bin/sh -c`` when QDC_DEVICE_HOST is
     "localhost" (the default), which avoids SSH password prompts for on-device
     runs, and through ``ssh`` in batch mode otherwise. In both cases
     ``; echo __RC__:$?`` is appended so the command's own exit status can be
     recovered from the output; the sentinel is stripped from the returned
     stdout. stderr is merged into stdout.

     Args:
         cmd: Shell command line to execute.
         check: When true, a non-zero exit code raises AssertionError.

     Returns:
         A CompletedProcess carrying the parsed exit code and captured output.

     Raises:
         AssertionError: ``check`` is true and the command exited non-zero.
         subprocess.TimeoutExpired: the command ran longer than 300 seconds.
     """
     device_host = os.getenv("QDC_DEVICE_HOST", "localhost")
     is_local = device_host == "localhost"
     print(f"[DEBUG] Running command on device_host='{device_host}'")
     print(f"[DEBUG] Command: {cmd[:200]}{'...' if len(cmd) > 200 else ''}")

     wrapped = f"{cmd}; echo __RC__:$?"
     if is_local:
         argv = ["/bin/sh", "-c", wrapped]
     else:
         print(f"[DEBUG] Using SSH to connect to {device_host}")
         argv = ["ssh", "-o", "BatchMode=yes", "-o", "ConnectTimeout=10", device_host, wrapped]

     try:
         raw = subprocess.run(
             argv,
             text=True,
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             timeout=300,
         )
     except subprocess.TimeoutExpired:
         print("[ERROR] Command timed out after 300 seconds")
         print(f"[ERROR] Command was: {cmd[:200]}")
         raise

     stdout = raw.stdout or ""
     returncode = raw.returncode

     # A non-zero status from ssh itself usually means the connection failed.
     if not is_local and raw.returncode != 0 and stdout:
         if "Permission denied" in stdout:
             print("[ERROR] SSH authentication failed! Password required or key not set up.")
             print("[ERROR] To fix: Set up passwordless SSH or set QDC_DEVICE_HOST=localhost if on-device")
         elif "Connection refused" in stdout:
             print(f"[ERROR] SSH connection refused to {device_host}. Is SSH server running?")
         elif "Host key verification failed" in stdout:
             print(f"[ERROR] SSH host key verification failed for {device_host}")
             print(f"[ERROR] To fix: ssh-keyscan {device_host} >> ~/.ssh/known_hosts")

     # Parse exit code from the __RC__: sentinel on the last line. rpartition
     # also finds it when the command's output did not end with a newline and
     # the sentinel was glued to the final output line.
     if stdout:
         lines = stdout.rstrip("\n").split("\n")
         head, marker, code = lines[-1].rpartition("__RC__:")
         if marker:
             try:
                 returncode = int(code)
             except ValueError:
                 print(f"[WARNING] Failed to parse exit code from: {lines[-1]}")
             else:
                 if head:
                     stdout = "\n".join(lines[:-1] + [head])
                 else:
                     stdout = "\n".join(lines[:-1]) + "\n"
     print(stdout)

     if returncode != 0:
         print(f"[ERROR] Command failed with exit code {returncode}")
         # Try to provide helpful context for common errors.
         if "No such file or directory" in stdout:
             print("[ERROR] File or directory not found. Check if binaries were pushed to device.")
             print(f"[ERROR] Expected bundle path: {BUNDLE_PATH}")
         elif "Permission denied" in stdout and is_local:
             print("[ERROR] Permission denied. Check file permissions or run with appropriate privileges.")
         elif "not found" in stdout.lower():
             print("[ERROR] Command not found. Check if binary exists and is in PATH or use full path.")

     result = subprocess.CompletedProcess(raw.args, returncode, stdout=stdout)
     if check and returncode != 0:
         # Raised explicitly so the check still fires when Python runs with -O.
         raise AssertionError(
             f"Command failed (exit {returncode})\nCommand: {cmd[:200]}\nOutput: {stdout[:500]}"
         )
     return result
 def write_qdc_log(filename: str, content: str) -> None:
     """Write content as a log file to QDC_LOGS_PATH on the device for QDC log collection.

     When QDC_DEVICE_HOST is "localhost" (the default) the file is written
     directly; otherwise the content goes to a local temporary file that is
     copied to the device with ``scp``. A failing scp exit status is reported
     but not raised.

     Args:
         filename: Name of the log file inside QDC_LOGS_PATH.
         content: Text to store in the log file.

     Raises:
         Exception: any error from the local write or the scp invocation
             (including subprocess.TimeoutExpired) is logged and re-raised.
     """
     print(f"[DEBUG] Writing QDC log: {filename} ({len(content)} bytes)")

     # Ensure log directory exists; a failure is only a warning here because the
     # write below will surface the real error.
     mkdir_result = run_shell_command(f"mkdir -p {QDC_LOGS_PATH}", check=False)
     if mkdir_result.returncode != 0:
         print(f"[WARNING] Failed to create log directory {QDC_LOGS_PATH}: {mkdir_result.stdout}")

     device_host = os.getenv("QDC_DEVICE_HOST", "localhost")
     try:
         if device_host == "localhost":
             # Running on-device: write directly to filesystem.
             log_path = f"{QDC_LOGS_PATH}/{filename}"
             with open(log_path, "w") as f:
                 f.write(content)
             print(f"[DEBUG] Successfully wrote log to {log_path}")
             return

         # Remote device: stage the content locally, then use SCP.
         tmp_path = None
         try:
             with tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False) as f:
                 # Record the name before writing so cleanup works if the write fails.
                 tmp_path = f.name
                 f.write(content)
             print(f"[DEBUG] Using SCP to transfer log to {device_host}")
             result = subprocess.run(
                 ["scp", "-o", "BatchMode=yes", "-o", "ConnectTimeout=10",
                  tmp_path, f"{device_host}:{QDC_LOGS_PATH}/{filename}"],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.STDOUT,
                 timeout=60,
                 text=True,
             )
         finally:
             # Always remove the staging file, including on scp timeout or error.
             if tmp_path is not None:
                 os.unlink(tmp_path)

         if result.returncode != 0:
             print(f"[ERROR] SCP failed with exit code {result.returncode}")
             print(f"[ERROR] Output: {result.stdout}")
         else:
             print(f"[DEBUG] Successfully transferred log to {device_host}:{QDC_LOGS_PATH}/{filename}")
     except Exception as e:
         print(f"[ERROR] Failed to write QDC log {filename}: {e}")
         raise
 def push_bundle_if_needed(check_binary: str) -> None:
     """Push llama_cpp_bundle to the device if check_binary is not already present.

     The bundle at /qdc/appium/llama_cpp_bundle/ is copied to /tmp/ with ``cp``
     when QDC_DEVICE_HOST is "localhost" (the default) and with ``scp``
     otherwise. A missing source bundle or a failed copy is reported with
     messages only; timeouts and unexpected errors are logged and re-raised.

     Args:
         check_binary: Path on the device whose presence means no push is needed.
     """
     print(f"[DEBUG] Checking if binary exists: {check_binary}")
     probe = run_shell_command(f"ls {check_binary}", check=False)
     if probe.returncode == 0:
         print(f"[DEBUG] Binary already exists on device: {check_binary}")
         return

     print(f"[WARNING] Binary not found: {check_binary}")
     print(f"[DEBUG] Will attempt to push bundle from /qdc/appium/llama_cpp_bundle/ to {BUNDLE_PATH}")

     device_host = os.getenv("QDC_DEVICE_HOST", "localhost")
     source_path = "/qdc/appium/llama_cpp_bundle/"
     # Capture settings shared by the cp and scp invocations.
     capture = dict(
         text=True,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
         timeout=120,
     )

     try:
         if device_host == "localhost":
             # Running on-device: copy locally (if source exists).
             if not os.path.exists(source_path):
                 print(f"[ERROR] Source bundle not found at {source_path}")
                 print(f"[ERROR] You may need to manually copy binaries to {BUNDLE_PATH}")
                 print(f"[ERROR] Expected structure: {BUNDLE_PATH}/{{bin,lib}}/")
                 return

             print(f"[DEBUG] Copying bundle from {source_path} to /tmp/")
             copied = subprocess.run(["cp", "-r", source_path, "/tmp/"], **capture)
             if copied.returncode != 0:
                 print(f"[ERROR] Failed to copy bundle: {copied.stdout}")
                 return

             print(f"[DEBUG] Successfully copied bundle to {BUNDLE_PATH}")
             # Verify the copy succeeded.
             recheck = run_shell_command(f"ls {check_binary}", check=False)
             if recheck.returncode == 0:
                 print(f"[DEBUG] Verified binary exists after copy: {check_binary}")
             else:
                 print(f"[ERROR] Bundle copied but binary still not found: {check_binary}")
             return

         # Remote device: use SCP.
         print(f"[DEBUG] Using SCP to transfer bundle to {device_host}:/tmp/")
         if not os.path.exists(source_path):
             print(f"[ERROR] Source bundle not found at {source_path}")
             print("[ERROR] Cannot push to remote device")
             return

         pushed = subprocess.run(
             ["scp", "-r", "-o", "BatchMode=yes", "-o", "ConnectTimeout=10",
              source_path, f"{device_host}:/tmp/"],
             **capture,
         )
         if pushed.returncode != 0:
             print(f"[ERROR] SCP failed: {pushed.stdout}")
             if "Permission denied" in pushed.stdout:
                 print("[ERROR] SSH authentication failed. Set up passwordless SSH.")
             return

         print(f"[DEBUG] Successfully transferred bundle to {device_host}:{BUNDLE_PATH}")
         # Verify the transfer succeeded.
         recheck = run_shell_command(f"ls {check_binary}", check=False)
         if recheck.returncode == 0:
             print(f"[DEBUG] Verified binary exists after transfer: {check_binary}")
         else:
             print(f"[ERROR] Bundle transferred but binary still not found: {check_binary}")
     except subprocess.TimeoutExpired:
         print("[ERROR] Timeout while pushing bundle (exceeded 120 seconds)")
         raise
     except Exception as e:
         print(f"[ERROR] Unexpected error while pushing bundle: {e}")
         raise