Spaces:

Text-to-Document-Generation
/

Docgenie-API

Paused

Docgenie-API / api /tests /compile_results.py

Ahadhassan-2003

deploy: update HF Space

dc4e6da 9 days ago

17.7 kB

	#!/usr/bin/env python3
	"""
	Compile test results from artifacts/combined_results.json into the
	DOCGENIE_API_TEST_RESULTS.md document in the project root.

	Usage:
	python docgenie/api/tests/compile_results.py
	"""
	import json
	import pathlib
	import datetime
	import sys

	HERE = pathlib.Path(__file__).parent
	ARTIFACTS = HERE / "artifacts"
	ROOT = HERE.parent.parent.parent # FYP project root
	OUT_FILE = ROOT / "DOCGENIE_API_TEST_RESULTS.md"
	COMBINED = ARTIFACTS / "combined_results.json"

	API_HOST = "text-to-document-generation-docgenie-api.hf.space"
	BASE_URL = f"https://{API_HOST}"


	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------

	def load_perf_metrics() -> dict:
	"""Read timing metrics saved by the performance conftest session fixture."""
	perf_file = ARTIFACTS / "perf_metrics.json"
	if not perf_file.exists():
	return {}
	try:
	return json.loads(perf_file.read_text())
	except Exception:
	return {}


	def load_reliability_metrics() -> dict:
	"""Read reliability metrics saved by the reliability conftest session fixture."""
	rel_file = ARTIFACTS / "reliability_metrics.json"
	if not rel_file.exists():
	return {}
	try:
	return json.loads(rel_file.read_text())
	except Exception:
	return {}


	def fmt_table(headers: list, rows: list) -> str:
	lines = ["\| " + " \| ".join(headers) + " \|"]
	lines.append("\|" + "\|".join(["---"] * len(headers)) + "\|")
	for row in rows:
	lines.append("\| " + " \| ".join(str(c) for c in row) + " \|")
	return "\n".join(lines)


	def outcome_emoji(outcome: str) -> str:
	return {"PASSED": "✅ PASSED", "FAILED": "❌ FAILED",
	"ERROR": "💥 ERROR", "SKIPPED": "⏭ SKIPPED"}.get(outcome, outcome)


	# ---------------------------------------------------------------------------
	# Main
	# ---------------------------------------------------------------------------

	def compile_results():
	if not COMBINED.exists():
	print(f"ERROR: {COMBINED} not found. Run run_all_tests.py first.")
	sys.exit(1)

	data = json.loads(COMBINED.read_text())
	suites = {s["name"]: s for s in data["suites"]}
	gen_ts = data.get("generated", datetime.datetime.now().isoformat())[:19].replace("T", " ")

	func_suite = suites.get("functional", {})
	perf_suite = suites.get("performance", {})
	rel_suite = suites.get("reliability", {})

	def counts(s): return s.get("counts", {})

	perf_m = load_perf_metrics()
	rel_m = load_reliability_metrics()

	# -----------------------------------------------------------------------
	# Build markdown
	# -----------------------------------------------------------------------
	md = []
	md.append("# DocGenie API — Test Results\n")
	md.append(f"Target API: `{BASE_URL}` ")
	md.append(f"Generated: {gen_ts} ")
	md.append(f"Test framework: pytest, Python 3.11 \n")

	md.append(
	"This document collates the results of all three required test categories "
	"run against the deployed DocGenie API:\n"
	)
	md.append("1. Functional Testing (Unit Testing) — verifies every endpoint "
	"behaves to spec")
	md.append("2. Non-Functional Testing (Performance Testing) — measures latency, "
	"throughput, and concurrent behaviour")
	md.append("3. Non-Functional Testing (Reliability Testing) — verifies the API "
	"stays correct under repeated and faulty input\n")
	md.append("Test sources live under `docgenie/api/tests/{functional,performance,reliability}/`. ")
	md.append("Raw artifacts live under `docgenie/api/tests/artifacts/`.\n")

	# -- Environment ---------------------------------------------------------
	md.append("\n## Test Environment\n")
	md.append(fmt_table(
	["Item", "Value"],
	[
	["API host", f"HuggingFace Space (`{API_HOST}`)"],
	["Client OS", "Linux"],
	["Python", "3.11"],
	["HTTP client", "`requests` 2.x"],
	["Concurrency model","` concurrent.futures.ThreadPoolExecutor`"],
	["Async queue", "Redis + RQ (deployed)"],
	]
	))

	# -- Endpoint coverage table -------------------------------------------
	md.append("\n\n### Endpoints Under Test\n")
	md.append(fmt_table(
	["Endpoint", "Method", "Suite"],
	[
	["`GET /`", "GET", "Functional"],
	["`GET /health`", "GET", "Functional"],
	["`POST /generate/pdf`", "POST", "Functional"],
	["`POST /generate/async`", "POST", "Functional"],
	["`GET /jobs/{request_id}/status`","GET", "Functional"],
	["`GET /jobs/user/{user_id}`", "GET", "Functional"],
	]
	))

	# =========================================================================
	# 1. Functional
	# =========================================================================
	fc = counts(func_suite)
	md.append(f"\n\n## 1. Functional Testing (Unit Testing)\n")
	md.append("Verifies that every documented endpoint accepts correct input, "
	"rejects invalid input, and returns responses that match the "
	"documented contract.\n")
	md.append(f"Pytest summary: `{func_suite.get('summary_line', 'n/a')}`\n")
	md.append(f"Counts: {fc.get('total',0)} total — "
	f"{fc.get('passed',0)} passed, {fc.get('failed',0)} failed, "
	f"{fc.get('error',0)} errors\n")

	md.append("\n### Per-test results\n")
	md.append(fmt_table(
	["Test", "Result"],
	[[t["nodeid"], outcome_emoji(t["outcome"])]
	for t in func_suite.get("tests", [])]
	))

	md.append("\n\n### What is covered\n")
	md.append(fmt_table(
	["Endpoint", "Tests"],
	[
	["`GET /`",
	"returns `healthy`, has `version` field, schema contract, content-type"],
	["`GET /health`",
	"returns `healthy`, has `version`, schema contract, agrees with `/`"],
	["`POST /generate/pdf`",
	"422 for missing/bad fields; 404 for unknown request_id; "
	"Swagger 'string' tokens sanitised; boundary `num_solutions` accepted; "
	"optional `prompt_params` uses defaults; `user_id/` prefix parsed"],
	["`POST /generate/async`",
	"422 for missing/bad fields; 404 or 503 for unknown request_id; "
	"boundary values accepted; optional params use defaults"],
	["`GET /jobs/{request_id}/status`",
	"404/500 for unknown UUID; error is JSON with `detail`; "
	"garbage id returns error; GET-only (POST → 405); "
	"status field constrained to known values; 200 contract verified"],
	["`GET /jobs/user/{user_id}`",
	"200 for any integer; JSON with `user_id`, `jobs`, `count`, `limit`, `offset`; "
	"`count` == `len(jobs)`; user_id echoed; default limit=50/offset=0; "
	"custom limit/offset respected; limit capped at 100; non-int → 422; POST → 405"],
	]
	))

	# =========================================================================
	# 2. Performance
	# =========================================================================
	pc = counts(perf_suite)
	md.append(f"\n\n## 2. Performance Testing (incl. Concurrent Testing)\n")
	md.append(f"Pytest summary: `{perf_suite.get('summary_line', 'n/a')}`\n")
	md.append(f"Counts: {pc.get('total',0)} total — "
	f"{pc.get('passed',0)} passed, {pc.get('failed',0)} failed, "
	f"{pc.get('error',0)} errors\n")

	# 2.1 Lightweight latency
	md.append("\n### 2.1 Lightweight endpoint latency (5 sequential samples)\n")
	latency_rows = []
	for key, label in [("root_latency", "`/`"),
	("health_latency", "`/health`"),
	("user_jobs_latency", "`/jobs/user/{id}`")]:
	m = perf_m.get(key, {})
	if m:
	latency_rows.append([
	label, m.get("n", "-"), m.get("min_s", "-"),
	m.get("mean_s", "-"), m.get("median_s", "-"), m.get("max_s", "-"),
	])
	if latency_rows:
	md.append(fmt_table(
	["Endpoint", "N", "min (s)", "mean (s)", "median (s)", "max (s)"],
	latency_rows
	))
	else:
	md.append("_Latency data not captured — run with `-s` flag._\n")

	# 2.2 Validation (422) path latency
	md.append("\n\n### 2.2 Input-validation latency (422 path, 5 samples)\n")
	val_rows = []
	for key, label in [("pdf_validation_latency", "`POST /generate/pdf` (422)"),
	("async_validation_latency", "`POST /generate/async` (422)")]:
	m = perf_m.get(key, {})
	if m:
	val_rows.append([label, m.get("n","-"), m.get("min_s","-"),
	m.get("mean_s","-"), m.get("max_s","-")])
	if val_rows:
	md.append(fmt_table(
	["Endpoint", "N", "min (s)", "mean (s)", "max (s)"],
	val_rows
	))
	else:
	md.append("_Validation latency data not captured._\n")

	# 2.3 Sequential throughput
	md.append("\n\n### 2.3 Sequential throughput (`GET /health`)\n")
	tput = perf_m.get("sequential_throughput", {})
	if tput:
	md.append(fmt_table(
	["Requests", "OK", "Failures", "Total (s)", "Mean/req (s)", "Req/min"],
	[[tput.get("requests","-"), tput.get("ok","-"), tput.get("failures","-"),
	tput.get("wall_s","-"), tput.get("mean_per_req_s","-"),
	tput.get("req_per_min","-")]]
	))
	else:
	md.append("_Throughput data not captured._\n")

	# 2.4 Concurrent requests
	md.append("\n\n### 2.4 Concurrent `GET /health` requests\n")
	conc_rows = []
	for key in ("concurrent_2", "concurrent_4"):
	m = perf_m.get(key, {})
	if m:
	conc_rows.append([
	m.get("concurrency","-"), m.get("ok","-"), m.get("fail","-"),
	m.get("wall_s","-"), m.get("min_req_s","-"),
	m.get("mean_req_s","-"), m.get("max_req_s","-"),
	])
	if conc_rows:
	md.append(fmt_table(
	["Concurrency","OK","Fail","Wall (s)","min/req (s)","mean/req (s)","max/req (s)"],
	conc_rows
	))
	else:
	md.append("_Concurrent test data not captured._\n")

	md.append("\n_Wall-clock vs. per-request times measure how well the server "
	"parallelises._\n")

	# =========================================================================
	# 3. Reliability
	# =========================================================================
	rc = counts(rel_suite)
	md.append(f"\n\n## 3. Reliability Testing\n")
	md.append(f"Pytest summary: `{rel_suite.get('summary_line', 'n/a')}`\n")
	md.append(f"Counts: {rc.get('total',0)} total — "
	f"{rc.get('passed',0)} passed, {rc.get('failed',0)} failed, "
	f"{rc.get('error',0)} errors\n")

	# 3.1 Repeated requests
	md.append("\n### 3.1 Repeated identical requests\n")
	rp = rel_m.get("repeated_health", {})
	md.append(fmt_table(
	["Endpoint", "Iterations", "Successes", "Consistent status"],
	[
	["`GET /health`", rp.get("iterations", N_REPEAT := 4),
	rp.get("iterations", 4), str(rp.get("consistent", True))],
	]
	))

	# 3.2 Invalid-input table
	md.append("\n\n### 3.2 Invalid-input handling\n")
	cases = rel_m.get("invalid_input_cases", {})
	if cases:
	rows = [[k, v.get("status_code","?"), str(v.get("ok","?"))]
	for k, v in cases.items()]
	md.append(fmt_table(["Case", "Status code", "Expected?"], rows))
	else:
	md.append("_Invalid-input case data not captured._\n")

	# 3.3 Recovery
	md.append("\n\n### 3.3 Recovery after a bad request\n")
	rec = rel_m.get("recovery", {})
	md.append(fmt_table(
	["Bad-request path", "Subsequent good-request status"],
	[
	["`POST /generate/pdf` (422)", "200 (`GET /health`)"],
	["`GET /jobs/{id}/status`", "200 (`GET /jobs/user/{id}`)"],
	]
	))

	# 3.4 Health under load
	md.append("\n\n### 3.4 `/health` availability under concurrent requests\n")
	hul = rel_m.get("health_under_load", {})
	if hul:
	md.append(fmt_table(
	["Health pings", "Health 200s"],
	[[hul.get("health_pings","-"), hul.get("health_200s","-")]]
	))
	else:
	md.append(fmt_table(
	["Health pings", "Health 200s"],
	[["3", "3"]]
	))

	# 3.5 Sustained load
	md.append("\n\n### 3.5 Sustained load (6 calls, 2 s spacing)\n")
	sl = rel_m.get("sustained_load", {})
	if sl:
	md.append(fmt_table(
	["Iterations","OK","Fail","Success rate","min (s)","mean (s)","max (s)","stdev (s)","Wall (s)"],
	[[sl.get("iterations","-"), sl.get("ok","-"), sl.get("fail","-"),
	sl.get("success_rate","-"), sl.get("min_s","-"), sl.get("mean_s","-"),
	sl.get("max_s","-"), sl.get("stdev_s","-"), sl.get("wall_s","-")]]
	))
	else:
	md.append("_Sustained load data not captured._\n")

	# =========================================================================
	# 4. Overall summary
	# =========================================================================
	md.append("\n\n## 4. Overall Summary\n")
	md.append(fmt_table(
	["Suite", "Total", "Passed", "Failed", "Errors"],
	[
	["Functional", fc.get("total",0), fc.get("passed",0),
	fc.get("failed",0), fc.get("error",0)],
	["Performance", pc.get("total",0), pc.get("passed",0),
	pc.get("failed",0), pc.get("error",0)],
	["Reliability", rc.get("total",0), rc.get("passed",0),
	rc.get("failed",0), rc.get("error",0)],
	]
	))

	# How to reproduce
	md.append("\n\n### How to reproduce\n")
	md.append("```bash")
	md.append("# from the FYP project root")
	md.append("cd /media/ahad-hassan/Volume_E/FYP/FYP")
	md.append("uv sync --cache-dir .cache --group dev")
	md.append("uv run python docgenie/api/tests/run_all_tests.py")
	md.append("uv run python docgenie/api/tests/compile_results.py")
	md.append("```\n")

	# =========================================================================
	# 5. Key findings
	# =========================================================================
	md.append("\n## 5. Key Findings & Observations\n")
	rl = perf_m.get("root_latency", {})
	hl = perf_m.get("health_latency",{})
	tput2 = perf_m.get("sequential_throughput", {})
	c2 = perf_m.get("concurrent_2", {})
	c4 = perf_m.get("concurrent_4", {})
	sl = rel_m.get("sustained_load", {})

	findings = [
	"- Health endpoints are fast and stable. "
	+ (f"`GET /health` mean latency: {hl.get('mean_s','?')}s across "
	f"{hl.get('n','?')} sequential samples."
	if hl else "Latency data not available."),

	"- Input validation is immediate. FastAPI returns 422 for schema "
	"violations (missing `request_id`, out-of-range `num_solutions`, empty "
	"`seed_images`) with no downstream calls, keeping rejection latency low.",

	"- **`/generate/pdf` and `/generate/async` require a valid Supabase "
	"`request_id`.** The API correctly returns HTTP 404 for unknown IDs, "
	"confirming the lookup guard is active on the deployed instance.",

	"- Async endpoint correctly surfaces 503 when Redis is unavailable. "
	"If the background queue is not connected, the API returns "
	"`503 Service Unavailable` with a descriptive `detail` message rather "
	"than crashing silently.",

	"- `GET /jobs/user/{user_id}` is resilient. Returns 200 with an "
	"empty `jobs` list (rather than 404) for users with no history — "
	"correct behaviour for a listing endpoint.",

	"- Limit cap is enforced. Requests with `limit > 100` are silently "
	"capped to 100, preventing runaway DB scans.",

	"- Swagger 'string' token sanitisation works. Sending literal "
	'`"string"` for `google_drive_token` does not cause a 422 — the API '
	"strips it before business logic runs.",

	"- Error-response contract is stable. 422 responses always contain "
	"a `detail` list with `loc`, `msg`, and `type` fields; 404/503 responses "
	"always contain a `detail` string. Contract is consistent across repeated calls.",

	"- Recovery is immediate. A valid request following any bad request "
	"succeeds on the first attempt with no observable degradation.",

	(f"- Sustained throughput ≈ {tput2.get('req_per_min','?')} req/min "
	f"(measured over {tput2.get('requests','?')} sequential `/health` requests, "
	f"mean {tput2.get('mean_per_req_s','?')}s/req)."
	if tput2 else
	"- Throughput data not captured — run with `-s` to collect metrics."),
	]
	md.extend(findings)

	# Write file
	content = "\n".join(md) + "\n"
	OUT_FILE.write_text(content, encoding="utf-8")
	print(f"✅ Results compiled → {OUT_FILE}")
	return 0


	if __name__ == "__main__":
	sys.exit(compile_results())