| """ |
| ROCmPort AI - Gradio Space entry point |
| Calls the deployed FastAPI backend (Render) and streams agent events. |
| """ |
|
|
| import gradio as gr |
| import httpx |
| import json |
|
|

BACKEND_URL = "https://rocmport-ai-q2b1.onrender.com"
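# Hosted backend for the public Space. If you run the FastAPI service yourself,
# you could point this at your own instance instead (e.g. "http://localhost:8000",
# shown purely as an illustration -- use whatever host/port you deploy on).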

AGENT_LABELS = {
    "analyzer": "Analyzer",
    "translator": "Translator",
    "optimizer": "Optimizer",
    "tester": "Tester",
    "coordinator": "Coordinator",
}

STATUS_LABELS = {
    "waiting": "[waiting]",
    "running": "[running]",
    "done": "[done]",
    "failed": "[FAILED]",
    "retrying": "[retrying]",
}
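# Agents or statuses not listed above fall back to the raw values sent by the
# backend (see the .get(...) lookups in port_kernel below).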

EXAMPLE_REDUCTION = """\
__global__ void reduction_kernel(float* g_idata, float* g_odata, unsigned int n) {
    extern __shared__ float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    float mySum = (i < n) ? g_idata[i] : 0;
    if (i + blockDim.x < n) mySum += g_idata[i + blockDim.x];
    sdata[tid] = mySum;
    __syncthreads();
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s) sdata[tid] = mySum = mySum + sdata[tid + s];
        __syncthreads();
    }
    // DELIBERATE BUG: assumes warpSize=32, wrong on AMD (warpSize=64)
    if (tid < 32) {
        volatile float* vsmem = sdata;
        vsmem[tid] = mySum = mySum + vsmem[tid + 32];
        vsmem[tid] = mySum = mySum + vsmem[tid + 16];
        vsmem[tid] = mySum = mySum + vsmem[tid + 8];
        vsmem[tid] = mySum = mySum + vsmem[tid + 4];
        vsmem[tid] = mySum = mySum + vsmem[tid + 2];
        vsmem[tid] = mySum = mySum + vsmem[tid + 1];
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}"""

EXAMPLE_VECTOR_ADD = """\
__global__ void vectorAdd(const float *A, const float *B, float *C, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        C[i] = A[i] + B[i];
        // Warp-size assumption: 32 threads per warp (wrong on AMD wavefront-64)
        if (threadIdx.x % 32 == 0) {
            printf("Warp leader: %d\\n", threadIdx.x / 32);
        }
    }
}"""

EXAMPLE_MATMUL = """\
__global__ void matmul(float *A, float *B, float *C, int N) {
    __shared__ float As[32][32];
    __shared__ float Bs[32][32];
    int row = blockIdx.y * 32 + threadIdx.y;
    int col = blockIdx.x * 32 + threadIdx.x;
    float sum = 0.0f;
    for (int k = 0; k < N / 32; k++) {
        As[threadIdx.y][threadIdx.x] = A[row * N + k * 32 + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = B[(k * 32 + threadIdx.y) * N + col];
        __syncthreads();
        for (int n = 0; n < 32; n++)
            sum += As[threadIdx.y][n] * Bs[n][threadIdx.x];
        __syncthreads();
    }
    C[row * N + col] = sum;
}"""


def port_kernel(cuda_code: str, kernel_name: str, simple_mode: bool):
    """Generator: streams agent events and yields (log_markdown, hip_code)."""
    if not cuda_code or len(cuda_code.strip()) < 10:
        yield "Please provide CUDA kernel code (at least 10 characters).", ""
        return

    kernel_name = kernel_name.strip() or "custom"
    log_lines: list[str] = []
    hip_code = ""

    payload = {
        "cuda_code": cuda_code,
        "kernel_name": kernel_name,
        "simple_mode": bool(simple_mode),
    }
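    # JSON body for the POST to {BACKEND_URL}/port below; the keys are forwarded
    # to the backend as-is.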

    log_lines.append("**Connecting to ROCmPort AI backend...**")
    yield "\n\n".join(log_lines), hip_code

    try:
        with httpx.Client(timeout=180.0) as client:
            with client.stream("POST", f"{BACKEND_URL}/port", json=payload) as resp:
                if resp.status_code >= 400:
                    # Read the streamed error body now so exc.response.text is
                    # available in the HTTPStatusError handler below.
                    resp.read()
                resp.raise_for_status()
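
                # The backend streams Server-Sent Events: each event is a line of the
                # form `data: {...json...}`, and the stream ends with `data: [DONE]`.
                # An event payload looks roughly like this (illustrative shape,
                # inferred from the parsing below):
                #   {"agent": "translator", "status": "done",
                #    "message": "...", "detail": "{\"hip_code\": \"...\"}"}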

                for line in resp.iter_lines():
                    if not line:
                        continue
                    if not line.startswith("data: "):
                        continue

                    data = line[6:]
                    if data.strip() == "[DONE]":
                        break

                    try:
                        event = json.loads(data)
                    except json.JSONDecodeError:
                        continue

                    agent = event.get("agent", "system")
                    status = event.get("status", "running")
                    message = event.get("message", "")
                    detail = event.get("detail") or ""

                    label = AGENT_LABELS.get(agent, agent.capitalize())
                    s_label = STATUS_LABELS.get(status, status)

                    log_lines.append(f"**{label}** {s_label} -- {message}")
                    if status == "done" and detail:
                        try:
                            detail_json = json.loads(detail)
                            candidate = (
                                detail_json.get("hip_code")
                                or detail_json.get("optimized_code")
                                or detail_json.get("translated_code")
                                or ""
                            )
                            if candidate:
                                hip_code = candidate
                        except (json.JSONDecodeError, AttributeError):
                            pass

                    yield "\n\n".join(log_lines), hip_code

    except httpx.ConnectError:
        log_lines.append(
            "**Could not connect to backend.**\n\n"
            "> The server may be cold-starting -- please wait ~30 s and retry."
        )
        yield "\n\n".join(log_lines), hip_code
        return
    except httpx.TimeoutException:
        log_lines.append("**Request timed out.** The pipeline may still be running -- try again shortly.")
        yield "\n\n".join(log_lines), hip_code
        return
    except httpx.HTTPStatusError as exc:
        log_lines.append(f"**HTTP {exc.response.status_code}**: {exc.response.text[:300]}")
        yield "\n\n".join(log_lines), hip_code
        return
    except Exception as exc:
        log_lines.append(f"**Unexpected error**: {exc}")
        yield "\n\n".join(log_lines), hip_code
        return

    if not hip_code:
        log_lines.append("\nPipeline finished but no HIP code was extracted. Check agent logs above.")
    else:
        log_lines.append("\n**Migration complete.** HIP code is shown on the right.")

    yield "\n\n".join(log_lines), hip_code


CSS = (
    ".panel-header { font-weight: 600; font-size: 1rem; margin-bottom: 4px; } "
    "footer { display: none !important; }"
)
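# The CSS above just bolds the panel headers and hides the default Gradio footer;
# it is passed to gr.Blocks via css=CSS below.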


with gr.Blocks(
    title="ROCmPort AI -- CUDA to ROCm Migration",
    theme=gr.themes.Default(primary_hue="orange"),
    css=CSS,
) as demo:

    gr.Markdown(
        """# ROCmPort AI
### Agentic CUDA to ROCm/HIP migration with wavefront-64 bug detection

> **Backend API**: [rocmport-ai-q2b1.onrender.com](https://rocmport-ai-q2b1.onrender.com) |
> **GitHub**: [tazwaryayyyy/ROCmPort-AI](https://github.com/tazwaryayyyy/ROCmPort-AI)

`hipify-clang` translates CUDA API calls mechanically -- it **cannot** detect that `if (tid < 32)` in a
warp-level reduction silently skips lanes 32-63 on AMD wavefront-64.
The code compiles, the output is wrong, no errors. **ROCmPort AI catches this before execution.**
"""
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input", elem_classes="panel-header")
            cuda_input = gr.Code(
                label="CUDA Kernel Code",
                language="cpp",
                lines=22,
                value=EXAMPLE_REDUCTION,
            )
            with gr.Row():
                kernel_name = gr.Textbox(
                    label="Kernel Name",
                    value="reduction",
                    placeholder="e.g. reduction, matmul, vector_add",
                    scale=2,
                )
                simple_mode = gr.Checkbox(
                    label="Explain Like I'm 5",
                    value=False,
                    scale=1,
                )
            with gr.Row():
                port_btn = gr.Button("Port to ROCm", variant="primary", scale=3)
                clear_btn = gr.Button("Clear", scale=1)

            gr.Examples(
                examples=[
                    [EXAMPLE_REDUCTION, "reduction", False],
                    [EXAMPLE_VECTOR_ADD, "vector_add", False],
                    [EXAMPLE_MATMUL, "matmul", False],
                ],
                inputs=[cuda_input, kernel_name, simple_mode],
                label="Demo Kernels (pre-loaded with intentional AMD bugs)",
            )
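            # Selecting an example populates all three inputs at once
            # (kernel source, kernel name, and the simple-mode flag).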

        with gr.Column(scale=1):
            gr.Markdown("### Output", elem_classes="panel-header")
            log_output = gr.Markdown(
                value="*Agent steps will appear here once you click **Port to ROCm**.*",
                label="Agent Pipeline Log",
            )
            hip_output = gr.Code(
                label="Translated and Optimized HIP Code",
                language="cpp",
                lines=18,
            )

    gr.Markdown(
        """
---
### How the pipeline works

| Agent | Role |
|-------|------|
| **Analyzer** | Scans CUDA for AMD-specific risks: wavefront size, ballot/shuffle idioms, shared-memory layout |
| **Translator** | Runs `hipify`, then applies LLM-guided fixes for bugs `hipify` cannot detect |
| **Tester** | Verifies compilation with `hipcc` and checks output correctness |
| **Optimizer** | Proposes MI300X-specific optimizations; re-tested against baseline |
| **Coordinator** | Orchestrates the loop; retries up to 3x if the optimized output regresses |

### The key bug: warp-size assumption

```c
// NVIDIA (warpSize = 32) -- silently WRONG on AMD
if (tid < 32) { vsmem[tid] += vsmem[tid + 32]; ... }

// AMD-correct (wavefront = 64)
if (tid < 64) {
    vsmem[tid] += vsmem[tid + 32];
    if (tid < 32) { vsmem[tid] += vsmem[tid + 16]; ... }
}
```

### Benchmark highlights (MI300X, ROCm 7.0)

| Kernel | Result |
|--------|--------|
| matrix_multiply 512x512 | 2.91x speedup over baseline HIP |
| vector_add 32M elements | ~3918 GB/s (~74% of MI300X peak) |
| reduction 16M elements | correctness PASS after wavefront-64 fix |

> Source: `docs/benchmark_runs/` -- real `rocprof` CSV output, May 2026.
> Results vary with kernel complexity; these figures are not guaranteed on every input.
"""
    )

    port_btn.click(
        fn=port_kernel,
        inputs=[cuda_input, kernel_name, simple_mode],
        outputs=[log_output, hip_output],
    )
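    # Because port_kernel is a generator, each `yield` streams an incremental
    # (log_markdown, hip_code) update to the two outputs wired above while the
    # backend pipeline is still running.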

    clear_btn.click(
        fn=lambda: ("*Agent steps will appear here once you click **Port to ROCm**.*", ""),
        outputs=[log_output, hip_output],
    )


if __name__ == "__main__":
    demo.launch()