| """ |
| ROCmPort AI - Gradio Space entry point |
| Calls the deployed FastAPI backend (Render) and streams agent events. |
| """ |
|
|
| import gradio as gr |
| import httpx |
| import json |
|
|

BACKEND_URL = "https://rocmport-ai-q2b1.onrender.com"
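# Hosted backend for the public Space. If you run the FastAPI service yourself,
# you could point this at your own instance instead (e.g. "http://localhost:8000",
# shown purely as an illustration -- use whatever host/port you deploy on).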

AGENT_LABELS = {
    "analyzer": "Analyzer",
    "translator": "Translator",
    "optimizer": "Optimizer",
    "tester": "Tester",
    "coordinator": "Coordinator",
}

STATUS_LABELS = {
    "waiting": "[waiting]",
    "running": "[running]",
    "done": "[done]",
    "failed": "[FAILED]",
    "retrying": "[retrying]",
}
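# Agents or statuses not listed above fall back to the raw values sent by the
# backend (see the .get(...) lookups in port_kernel below).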

EXAMPLE_REDUCTION = """\
__global__ void reduction_kernel(float* g_idata, float* g_odata, unsigned int n) {
    extern __shared__ float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    float mySum = (i < n) ? g_idata[i] : 0;
    if (i + blockDim.x < n) mySum += g_idata[i + blockDim.x];
    sdata[tid] = mySum;
    __syncthreads();
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s) sdata[tid] = mySum = mySum + sdata[tid + s];
        __syncthreads();
    }
    // DELIBERATE BUG: assumes warpSize=32, wrong on AMD (warpSize=64)
    if (tid < 32) {
        volatile float* vsmem = sdata;
        vsmem[tid] = mySum = mySum + vsmem[tid + 32];
        vsmem[tid] = mySum = mySum + vsmem[tid + 16];
        vsmem[tid] = mySum = mySum + vsmem[tid + 8];
        vsmem[tid] = mySum = mySum + vsmem[tid + 4];
        vsmem[tid] = mySum = mySum + vsmem[tid + 2];
        vsmem[tid] = mySum = mySum + vsmem[tid + 1];
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}"""

EXAMPLE_VECTOR_ADD = """\
__global__ void vectorAdd(const float *A, const float *B, float *C, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        C[i] = A[i] + B[i];
        // Warp-size assumption: 32 threads per warp (wrong on AMD wavefront-64)
        if (threadIdx.x % 32 == 0) {
            printf("Warp leader: %d\\n", threadIdx.x / 32);
        }
    }
}"""

EXAMPLE_MATMUL = """\
__global__ void matmul(float *A, float *B, float *C, int N) {
    __shared__ float As[32][32];
    __shared__ float Bs[32][32];
    int row = blockIdx.y * 32 + threadIdx.y;
    int col = blockIdx.x * 32 + threadIdx.x;
    float sum = 0.0f;
    for (int k = 0; k < N / 32; k++) {
        As[threadIdx.y][threadIdx.x] = A[row * N + k * 32 + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = B[(k * 32 + threadIdx.y) * N + col];
        __syncthreads();
        for (int n = 0; n < 32; n++)
            sum += As[threadIdx.y][n] * Bs[n][threadIdx.x];
        __syncthreads();
    }
    C[row * N + col] = sum;
}"""


def port_kernel(cuda_code: str, kernel_name: str, simple_mode: bool):
    """Generator: streams agent events and yields (log_markdown, hip_code)."""
    if not cuda_code or len(cuda_code.strip()) < 10:
        yield "Please provide CUDA kernel code (at least 10 characters).", ""
        return

    kernel_name = kernel_name.strip() or "custom"
    log_lines: list[str] = []
    hip_code = ""

    payload = {
        "cuda_code": cuda_code,
        "kernel_name": kernel_name,
        "simple_mode": bool(simple_mode),
    }
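    # JSON body for the POST to {BACKEND_URL}/port below; the keys are forwarded
    # to the backend as-is.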

    log_lines.append("**Connecting to ROCmPort AI backend...**")
    yield "\n\n".join(log_lines), hip_code

    try:
        with httpx.Client(timeout=180.0) as client:
            with client.stream("POST", f"{BACKEND_URL}/port", json=payload) as resp:
                if resp.status_code >= 400:
                    # Read the streamed error body now so exc.response.text is
                    # available in the HTTPStatusError handler below.
                    resp.read()
                resp.raise_for_status()
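
                # The backend streams Server-Sent Events: each event is a line of the
                # form `data: {...json...}`, and the stream ends with `data: [DONE]`.
                # An event payload looks roughly like this (illustrative shape,
                # inferred from the parsing below):
                #   {"agent": "translator", "status": "done",
                #    "message": "...", "detail": "{\"hip_code\": \"...\"}"}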

                for line in resp.iter_lines():
                    if not line:
                        continue
                    if not line.startswith("data: "):
                        continue

                    data = line[6:]
                    if data.strip() == "[DONE]":
                        break

                    try:
                        event = json.loads(data)
                    except json.JSONDecodeError:
                        continue

                    agent = event.get("agent", "system")
                    status = event.get("status", "running")
                    message = event.get("message", "")
                    detail = event.get("detail") or ""

                    label = AGENT_LABELS.get(agent, agent.capitalize())
                    s_label = STATUS_LABELS.get(status, status)

                    log_lines.append(f"**{label}** {s_label} -- {message}")
                    if status == "done" and detail:
                        try:
                            detail_json = json.loads(detail)
                            candidate = (
                                detail_json.get("hip_code")
                                or detail_json.get("optimized_code")
                                or detail_json.get("translated_code")
                                or ""
                            )
                            if candidate:
                                hip_code = candidate
                        except (json.JSONDecodeError, AttributeError):
                            pass

                    yield "\n\n".join(log_lines), hip_code

    except httpx.ConnectError:
        log_lines.append(
            "**Could not connect to backend.**\n\n"
            "> The server may be cold-starting -- please wait ~30 s and retry."
        )
        yield "\n\n".join(log_lines), hip_code
        return
    except httpx.TimeoutException:
        log_lines.append("**Request timed out.** The pipeline may still be running -- try again shortly.")
        yield "\n\n".join(log_lines), hip_code
        return
    except httpx.HTTPStatusError as exc:
        log_lines.append(f"**HTTP {exc.response.status_code}**: {exc.response.text[:300]}")
        yield "\n\n".join(log_lines), hip_code
        return
    except Exception as exc:
        log_lines.append(f"**Unexpected error**: {exc}")
        yield "\n\n".join(log_lines), hip_code
        return

    if not hip_code:
        log_lines.append("\nPipeline finished but no HIP code was extracted. Check agent logs above.")
    else:
        log_lines.append("\n**Migration complete.** HIP code is shown on the right.")

    yield "\n\n".join(log_lines), hip_code


CSS = (
    ".panel-header { font-weight: 600; font-size: 1rem; margin-bottom: 4px; } "
    "footer { display: none !important; }"
)
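# The CSS above just bolds the panel headers and hides the default Gradio footer;
# it is passed to gr.Blocks via css=CSS below.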


with gr.Blocks(
    title="ROCmPort AI -- CUDA to ROCm Migration",
    theme=gr.themes.Default(primary_hue="orange"),
    css=CSS,
) as demo:

    gr.Markdown(
        """# ROCmPort AI
### Agentic CUDA to ROCm/HIP migration with wavefront-64 bug detection

> **Backend API**: [rocmport-ai-q2b1.onrender.com](https://rocmport-ai-q2b1.onrender.com) |
> **GitHub**: [tazwaryayyyy/ROCmPort-AI](https://github.com/tazwaryayyyy/ROCmPort-AI)

`hipify-clang` translates CUDA API calls mechanically -- it **cannot** detect that `if (tid < 32)` in a
warp-level reduction silently skips lanes 32-63 on AMD wavefront-64.
The code compiles, the output is wrong, no errors. **ROCmPort AI catches this before execution.**
"""
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input", elem_classes="panel-header")
            cuda_input = gr.Code(
                label="CUDA Kernel Code",
                language="cpp",
                lines=22,
                value=EXAMPLE_REDUCTION,
            )
            with gr.Row():
                kernel_name = gr.Textbox(
                    label="Kernel Name",
                    value="reduction",
                    placeholder="e.g. reduction, matmul, vector_add",
                    scale=2,
                )
                simple_mode = gr.Checkbox(
                    label="Explain Like I'm 5",
                    value=False,
                    scale=1,
                )
            with gr.Row():
                port_btn = gr.Button("Port to ROCm", variant="primary", scale=3)
                clear_btn = gr.Button("Clear", scale=1)

            gr.Examples(
                examples=[
                    [EXAMPLE_REDUCTION, "reduction", False],
                    [EXAMPLE_VECTOR_ADD, "vector_add", False],
                    [EXAMPLE_MATMUL, "matmul", False],
                ],
                inputs=[cuda_input, kernel_name, simple_mode],
                label="Demo Kernels (pre-loaded with intentional AMD bugs)",
            )
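            # Selecting an example populates all three inputs at once
            # (kernel source, kernel name, and the simple-mode flag).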

        with gr.Column(scale=1):
            gr.Markdown("### Output", elem_classes="panel-header")
            log_output = gr.Markdown(
                value="*Agent steps will appear here once you click **Port to ROCm**.*",
                label="Agent Pipeline Log",
            )
            hip_output = gr.Code(
                label="Translated and Optimized HIP Code",
                language="cpp",
                lines=18,
            )

    gr.Markdown(
        """
---
### How the pipeline works

| Agent | Role |
|-------|------|
| **Analyzer** | Scans CUDA for AMD-specific risks: wavefront size, ballot/shuffle idioms, shared-memory layout |
| **Translator** | Runs `hipify`, then applies LLM-guided fixes for bugs `hipify` cannot detect |
| **Tester** | Verifies compilation with `hipcc` and checks output correctness |
| **Optimizer** | Proposes MI300X-specific optimizations; re-tested against baseline |
| **Coordinator** | Orchestrates the loop; retries up to 3x if the optimized output regresses |

### The key bug: warp-size assumption

```c
// NVIDIA (warpSize = 32) -- silently WRONG on AMD
if (tid < 32) { vsmem[tid] += vsmem[tid + 32]; ... }

// AMD-correct (wavefront = 64)
if (tid < 64) {
    vsmem[tid] += vsmem[tid + 32];
    if (tid < 32) { vsmem[tid] += vsmem[tid + 16]; ... }
}
```

### Benchmark highlights (MI300X, ROCm 7.0)

| Kernel | Result |
|--------|--------|
| matrix_multiply 512x512 | 2.91x speedup over baseline HIP |
| vector_add 32M elements | ~3918 GB/s (~74% of MI300X peak) |
| reduction 16M elements | correctness PASS after wavefront-64 fix |

> Source: `docs/benchmark_runs/` -- real `rocprof` CSV output, May 2026.
> Results vary with kernel complexity; these figures are not guaranteed on every input.
"""
    )

    port_btn.click(
        fn=port_kernel,
        inputs=[cuda_input, kernel_name, simple_mode],
        outputs=[log_output, hip_output],
    )
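    # Because port_kernel is a generator, each `yield` streams an incremental
    # (log_markdown, hip_code) update to the two outputs wired above while the
    # backend pipeline is still running.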

    clear_btn.click(
        fn=lambda: ("*Agent steps will appear here once you click **Port to ROCm**.*", ""),
        outputs=[log_output, hip_output],
    )


if __name__ == "__main__":
    demo.launch()