# ROCmPort-AI / app.py
# (Hugging Face Space listing header, kept as comments so the file parses;
#  last commit fcea1da by tazwarrrr: "Fix encoding: remove all non-ASCII
#  chars from app.py")
"""
ROCmPort AI - Gradio Space entry point
Calls the deployed FastAPI backend (Render) and streams agent events.
"""
import gradio as gr
import httpx
import json
# Base URL of the deployed FastAPI backend (Render) that runs the agent
# pipeline; this Space is only a thin streaming client for it.
BACKEND_URL = "https://rocmport-ai-q2b1.onrender.com"

# Display names for agent ids reported in SSE events; unknown ids fall back
# to str.capitalize() in port_kernel.
AGENT_LABELS = {
    "analyzer": "Analyzer",
    "translator": "Translator",
    "optimizer": "Optimizer",
    "tester": "Tester",
    "coordinator": "Coordinator",
}

# ASCII status badges shown next to each agent event in the markdown log;
# unknown statuses are rendered verbatim.
STATUS_LABELS = {
    "waiting": "[waiting]",
    "running": "[running]",
    "done": "[done]",
    "failed": "[FAILED]",
    "retrying": "[retrying]",
}
# Demo kernel 1: two-stage shared-memory reduction.  The final unrolled
# stage (`if (tid < 32)`) deliberately assumes warpSize == 32, so on AMD
# wavefront-64 hardware lanes 32-63 are silently skipped -- it compiles and
# runs but produces wrong sums.  This is exactly the bug class the pipeline
# demo is meant to catch (see the comment inside the kernel).
EXAMPLE_REDUCTION = """\
__global__ void reduction_kernel(float* g_idata, float* g_odata, unsigned int n) {
extern __shared__ float sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
float mySum = (i < n) ? g_idata[i] : 0;
if (i + blockDim.x < n) mySum += g_idata[i + blockDim.x];
sdata[tid] = mySum;
__syncthreads();
for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
if (tid < s) sdata[tid] = mySum = mySum + sdata[tid + s];
__syncthreads();
}
// DELIBERATE BUG: assumes warpSize=32, wrong on AMD (warpSize=64)
if (tid < 32) {
volatile float* vsmem = sdata;
vsmem[tid] = mySum = mySum + vsmem[tid + 32];
vsmem[tid] = mySum = mySum + vsmem[tid + 16];
vsmem[tid] = mySum = mySum + vsmem[tid + 8];
vsmem[tid] = mySum = mySum + vsmem[tid + 4];
vsmem[tid] = mySum = mySum + vsmem[tid + 2];
vsmem[tid] = mySum = mySum + vsmem[tid + 1];
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}"""
# Demo kernel 2: element-wise vector add whose warp-leader printf assumes
# 32-thread warps (wrong on AMD wavefront-64).
# NOTE: the "\\n" escape is required.  In this non-raw Python literal a bare
# "\n" would be converted to a real newline, splitting the printf format
# string across two lines and making the displayed CUDA code invalid C.
EXAMPLE_VECTOR_ADD = """\
__global__ void vectorAdd(const float *A, const float *B, float *C, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
C[i] = A[i] + B[i];
// Warp-size assumption: 32 threads per warp (wrong on AMD wavefront-64)
if (threadIdx.x % 32 == 0) {
printf("Warp leader: %d\\n", threadIdx.x / 32);
}
}
}"""
# Demo kernel 3: 32x32 tiled shared-memory matrix multiply.  The tile size
# is hard-coded to 32 and there is no bounds check on `row`/`col` -- one of
# the demo kernels advertised in the UI as containing intentional AMD
# pitfalls for the pipeline to analyze.
EXAMPLE_MATMUL = """\
__global__ void matmul(float *A, float *B, float *C, int N) {
__shared__ float As[32][32];
__shared__ float Bs[32][32];
int row = blockIdx.y * 32 + threadIdx.y;
int col = blockIdx.x * 32 + threadIdx.x;
float sum = 0.0f;
for (int k = 0; k < N / 32; k++) {
As[threadIdx.y][threadIdx.x] = A[row * N + k * 32 + threadIdx.x];
Bs[threadIdx.y][threadIdx.x] = B[(k * 32 + threadIdx.y) * N + col];
__syncthreads();
for (int n = 0; n < 32; n++)
sum += As[threadIdx.y][n] * Bs[n][threadIdx.x];
__syncthreads();
}
C[row * N + col] = sum;
}"""
def _extract_hip_code(detail: str) -> str:
    """Pull HIP source out of an agent's JSON ``detail`` blob.

    Returns the first non-empty candidate among the known keys, or "" when
    the blob is not valid JSON / not a dict / carries no code.
    """
    try:
        detail_json = json.loads(detail)
        return (
            detail_json.get("hip_code")
            or detail_json.get("optimized_code")
            or detail_json.get("translated_code")
            or ""
        )
    except (json.JSONDecodeError, AttributeError):
        return ""


def port_kernel(cuda_code: str, kernel_name: str, simple_mode: bool):
    """Generator: streams agent events and yields (log_markdown, hip_code).

    POSTs the CUDA source to the backend's ``/port`` SSE endpoint and yields
    an updated ``(log_markdown, hip_code)`` tuple after every agent event so
    Gradio can live-update both output panels.

    Args:
        cuda_code: CUDA kernel source to port (must be >= 10 chars stripped).
        kernel_name: Label sent to the backend; falls back to "custom".
        simple_mode: Forwarded to the backend as-is ("explain simply" flag).

    Yields:
        (markdown log of agent events so far, latest extracted HIP code).
    """
    if not cuda_code or len(cuda_code.strip()) < 10:
        yield "Please provide CUDA kernel code (at least 10 characters).", ""
        return

    kernel_name = kernel_name.strip() or "custom"
    log_lines: list[str] = []
    hip_code = ""
    payload = {
        "cuda_code": cuda_code,
        "kernel_name": kernel_name,
        "simple_mode": bool(simple_mode),
    }

    log_lines.append("**Connecting to ROCmPort AI backend...**")
    yield "\n\n".join(log_lines), hip_code

    try:
        with httpx.Client(timeout=180.0) as client:
            with client.stream("POST", f"{BACKEND_URL}/port", json=payload) as resp:
                # BUGFIX: on a streamed response the body is not read yet, so
                # accessing `exc.response.text` in the HTTPStatusError handler
                # below would raise httpx.ResponseNotRead and crash the
                # generator.  Read (small) error bodies before raising.
                if resp.status_code >= 400:
                    resp.read()
                resp.raise_for_status()
                for line in resp.iter_lines():
                    if not line:
                        continue
                    # Only SSE data frames carry agent events.
                    if not line.startswith("data: "):
                        continue
                    data = line[6:]
                    if data.strip() == "[DONE]":
                        break
                    try:
                        event = json.loads(data)
                    except json.JSONDecodeError:
                        continue  # tolerate partial/garbled frames
                    agent = event.get("agent", "system")
                    status = event.get("status", "running")
                    message = event.get("message", "")
                    detail = event.get("detail") or ""
                    label = AGENT_LABELS.get(agent, agent.capitalize())
                    s_label = STATUS_LABELS.get(status, status)
                    log_lines.append(f"**{label}** {s_label} -- {message}")
                    # A finished agent may embed HIP source in its detail
                    # payload; keep the most recent non-empty candidate.
                    if status == "done" and detail:
                        candidate = _extract_hip_code(detail)
                        if candidate:
                            hip_code = candidate
                    yield "\n\n".join(log_lines), hip_code
    except httpx.ConnectError:
        log_lines.append(
            "**Could not connect to backend.**\n\n"
            "> The server may be cold-starting -- please wait ~30 s and retry."
        )
        yield "\n\n".join(log_lines), hip_code
        return
    except httpx.TimeoutException:
        log_lines.append("**Request timed out.** The pipeline may still be running -- try again shortly.")
        yield "\n\n".join(log_lines), hip_code
        return
    except httpx.HTTPStatusError as exc:
        # Safe now: the error body was read above before raise_for_status().
        log_lines.append(f"**HTTP {exc.response.status_code}**: {exc.response.text[:300]}")
        yield "\n\n".join(log_lines), hip_code
        return
    except Exception as exc:  # noqa: BLE001 -- last-resort UI boundary
        log_lines.append(f"**Unexpected error**: {exc}")
        yield "\n\n".join(log_lines), hip_code
        return

    if not hip_code:
        log_lines.append("\nPipeline finished but no HIP code was extracted. Check agent logs above.")
    else:
        log_lines.append("\n**Migration complete.** HIP code is shown on the right.")
    yield "\n\n".join(log_lines), hip_code
# Custom page CSS: styles the "### Input"/"### Output" panel headers and
# hides the default Gradio footer.
CSS = (
    ".panel-header { font-weight: 600; font-size: 1rem; margin-bottom: 4px; } "
    "footer { display: none !important; }"
)
# BUGFIX: `theme` and `css` belong on the gr.Blocks() constructor --
# Blocks.launch() has no such parameters and raises TypeError when given
# them, so the app previously crashed at startup.
with gr.Blocks(
    title="ROCmPort AI -- CUDA to ROCm Migration",
    theme=gr.themes.Default(primary_hue="orange"),
    css=CSS,
) as demo:
    # Page header / pitch.
    gr.Markdown(
        """# ROCmPort AI
### Agentic CUDA to ROCm/HIP migration with wavefront-64 bug detection
> **Backend API**: [rocmport-ai-q2b1.onrender.com](https://rocmport-ai-q2b1.onrender.com) |
> **GitHub**: [tazwaryayyyy/ROCmPort-AI](https://github.com/tazwaryayyyy/ROCmPort-AI)
`hipify-clang` translates CUDA API calls mechanically -- it **cannot** detect that `if (tid < 32)` in a
warp-level reduction silently skips lanes 32-63 on AMD wavefront-64.
The code compiles, the output is wrong, no errors. **ROCmPort AI catches this before execution.**
"""
    )

    with gr.Row():
        # Left column: CUDA input, kernel metadata, action buttons, examples.
        with gr.Column(scale=1):
            gr.Markdown("### Input", elem_classes="panel-header")
            cuda_input = gr.Code(
                label="CUDA Kernel Code",
                language="cpp",
                lines=22,
                value=EXAMPLE_REDUCTION,
            )
            with gr.Row():
                kernel_name = gr.Textbox(
                    label="Kernel Name",
                    value="reduction",
                    placeholder="e.g. reduction, matmul, vector_add",
                    scale=2,
                )
                simple_mode = gr.Checkbox(
                    label="Explain Like I am 5",
                    value=False,
                    scale=1,
                )
            with gr.Row():
                port_btn = gr.Button("Port to ROCm", variant="primary", scale=3)
                clear_btn = gr.Button("Clear", scale=1)
            gr.Examples(
                examples=[
                    [EXAMPLE_REDUCTION, "reduction", False],
                    [EXAMPLE_VECTOR_ADD, "vector_add", False],
                    [EXAMPLE_MATMUL, "matmul", False],
                ],
                inputs=[cuda_input, kernel_name, simple_mode],
                label="Demo Kernels (pre-loaded with intentional AMD bugs)",
            )
        # Right column: streamed agent log and the resulting HIP code.
        with gr.Column(scale=1):
            gr.Markdown("### Output", elem_classes="panel-header")
            log_output = gr.Markdown(
                value="*Agent steps will appear here once you click **Port to ROCm**.*",
                label="Agent Pipeline Log",
            )
            hip_output = gr.Code(
                label="Translated and Optimized HIP Code",
                language="cpp",
                lines=18,
            )

    # Footer: pipeline explanation and benchmark table.
    gr.Markdown(
        """
---
### How the pipeline works
| Agent | Role |
|-------|------|
| **Analyzer** | Scans CUDA for AMD-specific risks: wavefront size, ballot/shuffle idioms, shared-memory layout |
| **Translator** | Runs `hipify` then applies LLM-guided fixes for bugs `hipify` cannot detect |
| **Tester** | Verifies compilation with `hipcc` and checks output correctness |
| **Optimizer** | Proposes MI300X-specific optimisations; re-tested against baseline |
| **Coordinator** | Orchestrates the loop; retries up to 3x if the optimised output regresses |
### The key bug: warp-size assumption
```c
// NVIDIA (warpSize = 32) -- silently WRONG on AMD
if (tid < 32) { vsmem[tid] += vsmem[tid + 32]; ... }
// AMD-correct (wavefront = 64)
if (tid < 64) {
vsmem[tid] += vsmem[tid + 32];
if (tid < 32) { vsmem[tid] += vsmem[tid + 16]; ... }
}
```
### Benchmark highlights (MI300X, ROCm 7.0)
| Kernel | Result |
|--------|--------|
| matrix_multiply 512x512 | 2.91x speedup over baseline HIP |
| vector_add 32M elements | ~3918 GB/s (~74% of MI300X peak) |
| reduction 16M elements | correctness PASS after wavefront-64 fix |
> Source: `docs/benchmark_runs/` -- real `rocprof` CSV output, May 2026.
> Results vary with kernel complexity; these figures are not guaranteed on every input.
"""
    )

    # Wire the actions: stream port_kernel's (log, hip) tuples into the
    # two output components; Clear resets both panels.
    port_btn.click(
        fn=port_kernel,
        inputs=[cuda_input, kernel_name, simple_mode],
        outputs=[log_output, hip_output],
    )
    clear_btn.click(
        fn=lambda: ("*Agent steps will appear here once you click **Port to ROCm**.*", ""),
        outputs=[log_output, hip_output],
    )

if __name__ == "__main__":
    demo.launch()