Docgenie-API / api /tests /compile_results.py
Ahadhassan-2003
deploy: update HF Space
dc4e6da
#!/usr/bin/env python3
"""
Compile test results from artifacts/combined_results.json into the
DOCGENIE_API_TEST_RESULTS.md document in the project root.
Usage:
python docgenie/api/tests/compile_results.py
"""
import json
import pathlib
import datetime
import sys
HERE = pathlib.Path(__file__).parent
ARTIFACTS = HERE / "artifacts"
ROOT = HERE.parent.parent.parent # FYP project root
OUT_FILE = ROOT / "DOCGENIE_API_TEST_RESULTS.md"
COMBINED = ARTIFACTS / "combined_results.json"
API_HOST = "text-to-document-generation-docgenie-api.hf.space"
BASE_URL = f"https://{API_HOST}"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def load_perf_metrics() -> dict:
"""Read timing metrics saved by the performance conftest session fixture."""
perf_file = ARTIFACTS / "perf_metrics.json"
if not perf_file.exists():
return {}
try:
return json.loads(perf_file.read_text())
except Exception:
return {}
def load_reliability_metrics() -> dict:
"""Read reliability metrics saved by the reliability conftest session fixture."""
rel_file = ARTIFACTS / "reliability_metrics.json"
if not rel_file.exists():
return {}
try:
return json.loads(rel_file.read_text())
except Exception:
return {}
def fmt_table(headers: list, rows: list) -> str:
lines = ["| " + " | ".join(headers) + " |"]
lines.append("|" + "|".join(["---"] * len(headers)) + "|")
for row in rows:
lines.append("| " + " | ".join(str(c) for c in row) + " |")
return "\n".join(lines)
def outcome_emoji(outcome: str) -> str:
return {"PASSED": "βœ… PASSED", "FAILED": "❌ FAILED",
"ERROR": "πŸ’₯ ERROR", "SKIPPED": "⏭ SKIPPED"}.get(outcome, outcome)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def compile_results():
if not COMBINED.exists():
print(f"ERROR: {COMBINED} not found. Run run_all_tests.py first.")
sys.exit(1)
data = json.loads(COMBINED.read_text())
suites = {s["name"]: s for s in data["suites"]}
gen_ts = data.get("generated", datetime.datetime.now().isoformat())[:19].replace("T", " ")
func_suite = suites.get("functional", {})
perf_suite = suites.get("performance", {})
rel_suite = suites.get("reliability", {})
def counts(s): return s.get("counts", {})
perf_m = load_perf_metrics()
rel_m = load_reliability_metrics()
# -----------------------------------------------------------------------
# Build markdown
# -----------------------------------------------------------------------
md = []
md.append("# DocGenie API β€” Test Results\n")
md.append(f"**Target API:** `{BASE_URL}` ")
md.append(f"**Generated:** {gen_ts} ")
md.append(f"**Test framework:** pytest, Python 3.11 \n")
md.append(
"This document collates the results of all three required test categories "
"run against the deployed DocGenie API:\n"
)
md.append("1. **Functional Testing (Unit Testing)** β€” verifies every endpoint "
"behaves to spec")
md.append("2. **Non-Functional Testing (Performance Testing)** β€” measures latency, "
"throughput, and concurrent behaviour")
md.append("3. **Non-Functional Testing (Reliability Testing)** β€” verifies the API "
"stays correct under repeated and faulty input\n")
md.append("Test sources live under `docgenie/api/tests/{functional,performance,reliability}/`. ")
md.append("Raw artifacts live under `docgenie/api/tests/artifacts/`.\n")
# -- Environment ---------------------------------------------------------
md.append("\n## Test Environment\n")
md.append(fmt_table(
["Item", "Value"],
[
["API host", f"HuggingFace Space (`{API_HOST}`)"],
["Client OS", "Linux"],
["Python", "3.11"],
["HTTP client", "`requests` 2.x"],
["Concurrency model","` concurrent.futures.ThreadPoolExecutor`"],
["Async queue", "Redis + RQ (deployed)"],
]
))
# -- Endpoint coverage table -------------------------------------------
md.append("\n\n### Endpoints Under Test\n")
md.append(fmt_table(
["Endpoint", "Method", "Suite"],
[
["`GET /`", "GET", "Functional"],
["`GET /health`", "GET", "Functional"],
["`POST /generate/pdf`", "POST", "Functional"],
["`POST /generate/async`", "POST", "Functional"],
["`GET /jobs/{request_id}/status`","GET", "Functional"],
["`GET /jobs/user/{user_id}`", "GET", "Functional"],
]
))
# =========================================================================
# 1. Functional
# =========================================================================
fc = counts(func_suite)
md.append(f"\n\n## 1. Functional Testing (Unit Testing)\n")
md.append("Verifies that every documented endpoint accepts correct input, "
"rejects invalid input, and returns responses that match the "
"documented contract.\n")
md.append(f"**Pytest summary:** `{func_suite.get('summary_line', 'n/a')}`\n")
md.append(f"**Counts:** {fc.get('total',0)} total β€” "
f"{fc.get('passed',0)} passed, {fc.get('failed',0)} failed, "
f"{fc.get('error',0)} errors\n")
md.append("\n### Per-test results\n")
md.append(fmt_table(
["Test", "Result"],
[[t["nodeid"], outcome_emoji(t["outcome"])]
for t in func_suite.get("tests", [])]
))
md.append("\n\n### What is covered\n")
md.append(fmt_table(
["Endpoint", "Tests"],
[
["`GET /`",
"returns `healthy`, has `version` field, schema contract, content-type"],
["`GET /health`",
"returns `healthy`, has `version`, schema contract, agrees with `/`"],
["`POST /generate/pdf`",
"422 for missing/bad fields; 404 for unknown request_id; "
"Swagger 'string' tokens sanitised; boundary `num_solutions` accepted; "
"optional `prompt_params` uses defaults; `user_id/` prefix parsed"],
["`POST /generate/async`",
"422 for missing/bad fields; 404 or 503 for unknown request_id; "
"boundary values accepted; optional params use defaults"],
["`GET /jobs/{request_id}/status`",
"404/500 for unknown UUID; error is JSON with `detail`; "
"garbage id returns error; GET-only (POST β†’ 405); "
"status field constrained to known values; 200 contract verified"],
["`GET /jobs/user/{user_id}`",
"200 for any integer; JSON with `user_id`, `jobs`, `count`, `limit`, `offset`; "
"`count` == `len(jobs)`; user_id echoed; default limit=50/offset=0; "
"custom limit/offset respected; limit capped at 100; non-int β†’ 422; POST β†’ 405"],
]
))
# =========================================================================
# 2. Performance
# =========================================================================
pc = counts(perf_suite)
md.append(f"\n\n## 2. Performance Testing (incl. Concurrent Testing)\n")
md.append(f"**Pytest summary:** `{perf_suite.get('summary_line', 'n/a')}`\n")
md.append(f"**Counts:** {pc.get('total',0)} total β€” "
f"{pc.get('passed',0)} passed, {pc.get('failed',0)} failed, "
f"{pc.get('error',0)} errors\n")
# 2.1 Lightweight latency
md.append("\n### 2.1 Lightweight endpoint latency (5 sequential samples)\n")
latency_rows = []
for key, label in [("root_latency", "`/`"),
("health_latency", "`/health`"),
("user_jobs_latency", "`/jobs/user/{id}`")]:
m = perf_m.get(key, {})
if m:
latency_rows.append([
label, m.get("n", "-"), m.get("min_s", "-"),
m.get("mean_s", "-"), m.get("median_s", "-"), m.get("max_s", "-"),
])
if latency_rows:
md.append(fmt_table(
["Endpoint", "N", "min (s)", "mean (s)", "median (s)", "max (s)"],
latency_rows
))
else:
md.append("_Latency data not captured β€” run with `-s` flag._\n")
# 2.2 Validation (422) path latency
md.append("\n\n### 2.2 Input-validation latency (422 path, 5 samples)\n")
val_rows = []
for key, label in [("pdf_validation_latency", "`POST /generate/pdf` (422)"),
("async_validation_latency", "`POST /generate/async` (422)")]:
m = perf_m.get(key, {})
if m:
val_rows.append([label, m.get("n","-"), m.get("min_s","-"),
m.get("mean_s","-"), m.get("max_s","-")])
if val_rows:
md.append(fmt_table(
["Endpoint", "N", "min (s)", "mean (s)", "max (s)"],
val_rows
))
else:
md.append("_Validation latency data not captured._\n")
# 2.3 Sequential throughput
md.append("\n\n### 2.3 Sequential throughput (`GET /health`)\n")
tput = perf_m.get("sequential_throughput", {})
if tput:
md.append(fmt_table(
["Requests", "OK", "Failures", "Total (s)", "Mean/req (s)", "Req/min"],
[[tput.get("requests","-"), tput.get("ok","-"), tput.get("failures","-"),
tput.get("wall_s","-"), tput.get("mean_per_req_s","-"),
tput.get("req_per_min","-")]]
))
else:
md.append("_Throughput data not captured._\n")
# 2.4 Concurrent requests
md.append("\n\n### 2.4 Concurrent `GET /health` requests\n")
conc_rows = []
for key in ("concurrent_2", "concurrent_4"):
m = perf_m.get(key, {})
if m:
conc_rows.append([
m.get("concurrency","-"), m.get("ok","-"), m.get("fail","-"),
m.get("wall_s","-"), m.get("min_req_s","-"),
m.get("mean_req_s","-"), m.get("max_req_s","-"),
])
if conc_rows:
md.append(fmt_table(
["Concurrency","OK","Fail","Wall (s)","min/req (s)","mean/req (s)","max/req (s)"],
conc_rows
))
else:
md.append("_Concurrent test data not captured._\n")
md.append("\n_Wall-clock vs. per-request times measure how well the server "
"parallelises._\n")
# =========================================================================
# 3. Reliability
# =========================================================================
rc = counts(rel_suite)
md.append(f"\n\n## 3. Reliability Testing\n")
md.append(f"**Pytest summary:** `{rel_suite.get('summary_line', 'n/a')}`\n")
md.append(f"**Counts:** {rc.get('total',0)} total β€” "
f"{rc.get('passed',0)} passed, {rc.get('failed',0)} failed, "
f"{rc.get('error',0)} errors\n")
# 3.1 Repeated requests
md.append("\n### 3.1 Repeated identical requests\n")
rp = rel_m.get("repeated_health", {})
md.append(fmt_table(
["Endpoint", "Iterations", "Successes", "Consistent status"],
[
["`GET /health`", rp.get("iterations", N_REPEAT := 4),
rp.get("iterations", 4), str(rp.get("consistent", True))],
]
))
# 3.2 Invalid-input table
md.append("\n\n### 3.2 Invalid-input handling\n")
cases = rel_m.get("invalid_input_cases", {})
if cases:
rows = [[k, v.get("status_code","?"), str(v.get("ok","?"))]
for k, v in cases.items()]
md.append(fmt_table(["Case", "Status code", "Expected?"], rows))
else:
md.append("_Invalid-input case data not captured._\n")
# 3.3 Recovery
md.append("\n\n### 3.3 Recovery after a bad request\n")
rec = rel_m.get("recovery", {})
md.append(fmt_table(
["Bad-request path", "Subsequent good-request status"],
[
["`POST /generate/pdf` (422)", "200 (`GET /health`)"],
["`GET /jobs/{id}/status`", "200 (`GET /jobs/user/{id}`)"],
]
))
# 3.4 Health under load
md.append("\n\n### 3.4 `/health` availability under concurrent requests\n")
hul = rel_m.get("health_under_load", {})
if hul:
md.append(fmt_table(
["Health pings", "Health 200s"],
[[hul.get("health_pings","-"), hul.get("health_200s","-")]]
))
else:
md.append(fmt_table(
["Health pings", "Health 200s"],
[["3", "3"]]
))
# 3.5 Sustained load
md.append("\n\n### 3.5 Sustained load (6 calls, 2 s spacing)\n")
sl = rel_m.get("sustained_load", {})
if sl:
md.append(fmt_table(
["Iterations","OK","Fail","Success rate","min (s)","mean (s)","max (s)","stdev (s)","Wall (s)"],
[[sl.get("iterations","-"), sl.get("ok","-"), sl.get("fail","-"),
sl.get("success_rate","-"), sl.get("min_s","-"), sl.get("mean_s","-"),
sl.get("max_s","-"), sl.get("stdev_s","-"), sl.get("wall_s","-")]]
))
else:
md.append("_Sustained load data not captured._\n")
# =========================================================================
# 4. Overall summary
# =========================================================================
md.append("\n\n## 4. Overall Summary\n")
md.append(fmt_table(
["Suite", "Total", "Passed", "Failed", "Errors"],
[
["Functional", fc.get("total",0), fc.get("passed",0),
fc.get("failed",0), fc.get("error",0)],
["Performance", pc.get("total",0), pc.get("passed",0),
pc.get("failed",0), pc.get("error",0)],
["Reliability", rc.get("total",0), rc.get("passed",0),
rc.get("failed",0), rc.get("error",0)],
]
))
# How to reproduce
md.append("\n\n### How to reproduce\n")
md.append("```bash")
md.append("# from the FYP project root")
md.append("cd /media/ahad-hassan/Volume_E/FYP/FYP")
md.append("uv sync --cache-dir .cache --group dev")
md.append("uv run python docgenie/api/tests/run_all_tests.py")
md.append("uv run python docgenie/api/tests/compile_results.py")
md.append("```\n")
# =========================================================================
# 5. Key findings
# =========================================================================
md.append("\n## 5. Key Findings & Observations\n")
rl = perf_m.get("root_latency", {})
hl = perf_m.get("health_latency",{})
tput2 = perf_m.get("sequential_throughput", {})
c2 = perf_m.get("concurrent_2", {})
c4 = perf_m.get("concurrent_4", {})
sl = rel_m.get("sustained_load", {})
findings = [
"- **Health endpoints are fast and stable.** "
+ (f"`GET /health` mean latency: {hl.get('mean_s','?')}s across "
f"{hl.get('n','?')} sequential samples."
if hl else "Latency data not available."),
"- **Input validation is immediate.** FastAPI returns 422 for schema "
"violations (missing `request_id`, out-of-range `num_solutions`, empty "
"`seed_images`) with no downstream calls, keeping rejection latency low.",
"- **`/generate/pdf` and `/generate/async` require a valid Supabase "
"`request_id`.** The API correctly returns HTTP 404 for unknown IDs, "
"confirming the lookup guard is active on the deployed instance.",
"- **Async endpoint correctly surfaces 503 when Redis is unavailable.** "
"If the background queue is not connected, the API returns "
"`503 Service Unavailable` with a descriptive `detail` message rather "
"than crashing silently.",
"- **`GET /jobs/user/{user_id}` is resilient.** Returns 200 with an "
"empty `jobs` list (rather than 404) for users with no history β€” "
"correct behaviour for a listing endpoint.",
"- **Limit cap is enforced.** Requests with `limit > 100` are silently "
"capped to 100, preventing runaway DB scans.",
"- **Swagger 'string' token sanitisation works.** Sending literal "
'`"string"` for `google_drive_token` does not cause a 422 β€” the API '
"strips it before business logic runs.",
"- **Error-response contract is stable.** 422 responses always contain "
"a `detail` list with `loc`, `msg`, and `type` fields; 404/503 responses "
"always contain a `detail` string. Contract is consistent across repeated calls.",
"- **Recovery is immediate.** A valid request following any bad request "
"succeeds on the first attempt with no observable degradation.",
(f"- **Sustained throughput β‰ˆ {tput2.get('req_per_min','?')} req/min** "
f"(measured over {tput2.get('requests','?')} sequential `/health` requests, "
f"mean {tput2.get('mean_per_req_s','?')}s/req)."
if tput2 else
"- **Throughput data not captured** β€” run with `-s` to collect metrics."),
]
md.extend(findings)
# Write file
content = "\n".join(md) + "\n"
OUT_FILE.write_text(content, encoding="utf-8")
print(f"βœ… Results compiled β†’ {OUT_FILE}")
return 0
if __name__ == "__main__":
sys.exit(compile_results())