Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	v1
Browse files- Dockerfile +37 -0
- README.md +4 -4
- app.py +92 -0
- requirements.txt +4 -0
- tool/.DS_Store +0 -0
- tool/README.md +0 -0
- tool/__init__.py +0 -0
- tool/input.txt +1 -0
- tool/prompt_a.txt +7 -0
- tool/prompt_b.txt +5 -0
- tool/prompt_c.txt +3 -0
- tool/prompt_e.txt +3 -0
- tool/prompt_f.txt +3 -0
- tool/sample.txt +140 -0
    	
        Dockerfile
    ADDED
    
    | @@ -0,0 +1,37 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            ENV DEBIAN_FRONTEND=noninteractive
         | 
| 4 | 
            +
            RUN apt-get update && \
         | 
| 5 | 
            +
                apt-get install -y --no-install-recommends wget ca-certificates && \
         | 
| 6 | 
            +
                wget -q https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb -O /tmp/nsys.deb && \
         | 
| 7 | 
            +
                apt-get install -y /tmp/nsys.deb && \
         | 
| 8 | 
            +
                rm -f /tmp/nsys.deb && \
         | 
| 9 | 
            +
                apt-get install -y python3.11 python3.11-distutils python3.11-dev python3-pip && \
         | 
| 10 | 
            +
                update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
         | 
| 11 | 
            +
                python -m pip install --upgrade pip && \
         | 
| 12 | 
            +
                rm -rf /var/lib/apt/lists/*
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            RUN apt-get update && apt-get install -y gcc g++ build-essential && \
         | 
| 15 | 
            +
                python -m pip install --no-cache-dir nuitka
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            # 可选:创建非 root 用户
         | 
| 18 | 
            +
            RUN useradd -m -u 1000 user
         | 
| 19 | 
            +
            USER user
         | 
| 20 | 
            +
            ENV PATH="/home/user/.local/bin:$PATH"
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            WORKDIR /app
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            # 安装 Python 依赖
         | 
| 25 | 
            +
            COPY --chown=user requirements.txt .
         | 
| 26 | 
            +
            RUN pip install --no-cache-dir -r requirements.txt
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            # 复制源码
         | 
| 29 | 
            +
            COPY --chown=user app.py .
         | 
| 30 | 
            +
            COPY --chown=user tool /app/tool
         | 
| 31 | 
            +
            RUN chown -R user:user /app
         | 
| 32 | 
            +
             | 
| 33 | 
            +
            # Gradio 默认监听 7860
         | 
| 34 | 
            +
            EXPOSE 7860
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            # 直接启动 Python 脚本
         | 
| 37 | 
            +
            CMD ["python", "app.py"]
         | 
    	
        README.md
    CHANGED
    
    | @@ -1,8 +1,8 @@ | |
| 1 | 
             
            ---
         | 
| 2 | 
            -
            title:  | 
| 3 | 
            -
            emoji:  | 
| 4 | 
            -
            colorFrom:  | 
| 5 | 
            -
            colorTo:  | 
| 6 | 
             
            sdk: docker
         | 
| 7 | 
             
            pinned: false
         | 
| 8 | 
             
            license: cc-by-nc-4.0
         | 
|  | |
| 1 | 
             
            ---
         | 
| 2 | 
            +
            title: CUDA LLM
         | 
| 3 | 
            +
            emoji: 📈
         | 
| 4 | 
            +
            colorFrom: pink
         | 
| 5 | 
            +
            colorTo: red
         | 
| 6 | 
             
            sdk: docker
         | 
| 7 | 
             
            pinned: false
         | 
| 8 | 
             
            license: cc-by-nc-4.0
         | 
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,92 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import os, tempfile, time
         | 
| 2 | 
            +
            import gradio as gr
         | 
| 3 | 
            +
            from tool.testv3 import run_autotune_pipeline
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            # ---------- Core callback ----------
         | 
| 6 | 
            +
            def generate_kernel(text_input, n_iters, progress=gr.Progress()):
         | 
| 7 | 
            +
                """
         | 
| 8 | 
            +
                text_input : string from textbox (NL description or base CUDA code)
         | 
| 9 | 
            +
                file_input : gr.File upload object (or None)
         | 
| 10 | 
            +
                Returns   : (kernel_code_str, downloadable_file_path)
         | 
| 11 | 
            +
                """
         | 
| 12 | 
            +
                progress((0, n_iters), desc="Initializing...")
         | 
| 13 | 
            +
                # 1) Select input source
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                if not text_input.strip():
         | 
| 16 | 
            +
                    return "⚠️ Please paste a description or baseline CUDA code.", "", None
         | 
| 17 | 
            +
                    
         | 
| 18 | 
            +
                td = tempfile.mkdtemp(prefix="auto_")
         | 
| 19 | 
            +
                src_path = os.path.join(td, f"input_{int(time.time())}.txt")
         | 
| 20 | 
            +
                with open(src_path, "w") as f:
         | 
| 21 | 
            +
                    f.write(text_input)
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                best_code = ""
         | 
| 24 | 
            +
                for info in run_autotune_pipeline(src_path, n_iters):
         | 
| 25 | 
            +
                    # 1) update progress bar (if iteration known)
         | 
| 26 | 
            +
                    if info["iteration"] is not None:
         | 
| 27 | 
            +
                        # print(f"Iteration {info['iteration']} / {n_iters}: {info['message']}")
         | 
| 28 | 
            +
                        progress((info["iteration"], n_iters), desc=info["message"])
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                    # 3) kernel output only when we get new code
         | 
| 31 | 
            +
                    if info["code"]:
         | 
| 32 | 
            +
                        best_code = info["code"]
         | 
| 33 | 
            +
             | 
| 34 | 
            +
             | 
| 35 | 
            +
                # last yield enables the download button
         | 
| 36 | 
            +
                return best_code
         | 
| 37 | 
            +
             | 
| 38 | 
            +
             | 
| 39 | 
            +
            # ---------- Gradio UI ----------
         | 
| 40 | 
            +
            with gr.Blocks(title="KernelPilot", theme=gr.themes.Soft(text_size="lg", font=[
         | 
| 41 | 
            +
                    "system-ui",
         | 
| 42 | 
            +
                    "-apple-system",
         | 
| 43 | 
            +
                    "BlinkMacSystemFont",
         | 
| 44 | 
            +
                    "Segoe UI",
         | 
| 45 | 
            +
                    "Roboto",
         | 
| 46 | 
            +
                    "Helvetica Neue",
         | 
| 47 | 
            +
                    "Arial",
         | 
| 48 | 
            +
                    "Noto Sans",
         | 
| 49 | 
            +
                    "sans-serif"
         | 
| 50 | 
            +
                ])) as demo:
         | 
| 51 | 
            +
                gr.Markdown(
         | 
| 52 | 
            +
                    """# 🚀 KernelPilot  
         | 
| 53 | 
            +
            Enter a natural‑language description ,  
         | 
| 54 | 
            +
            then click **Generate** to obtain the kernel function."""
         | 
| 55 | 
            +
                )
         | 
| 56 | 
            +
             | 
| 57 | 
            +
                with gr.Row():
         | 
| 58 | 
            +
                    txt_input = gr.Textbox(
         | 
| 59 | 
            +
                        label="📝 Input",
         | 
| 60 | 
            +
                        lines=10,
         | 
| 61 | 
            +
                        placeholder="Describe the kernel",
         | 
| 62 | 
            +
                        scale=3
         | 
| 63 | 
            +
                    )
         | 
| 64 | 
            +
                    level = gr.Number(
         | 
| 65 | 
            +
                        label="Optimazation Level",
         | 
| 66 | 
            +
                        minimum=1,
         | 
| 67 | 
            +
                        maximum=5,
         | 
| 68 | 
            +
                        value=5,
         | 
| 69 | 
            +
                        step=1,
         | 
| 70 | 
            +
                        scale=1
         | 
| 71 | 
            +
                    )
         | 
| 72 | 
            +
             | 
| 73 | 
            +
             | 
| 74 | 
            +
                gen_btn = gr.Button("⚡ Generate")
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                kernel_output = gr.Code(
         | 
| 77 | 
            +
                    label="🎯 Tuned CUDA Kernel",
         | 
| 78 | 
            +
                    language="cpp"
         | 
| 79 | 
            +
                )
         | 
| 80 | 
            +
             | 
| 81 | 
            +
                gen_btn.click(
         | 
| 82 | 
            +
                    fn=generate_kernel,
         | 
| 83 | 
            +
                    inputs=[txt_input, level],
         | 
| 84 | 
            +
                    outputs=[kernel_output],
         | 
| 85 | 
            +
                    queue=True,               # keeps requests queued
         | 
| 86 | 
            +
                    show_progress=True,  # show progress bar
         | 
| 87 | 
            +
                    show_progress_on=kernel_output  # update log box with progress
         | 
| 88 | 
            +
                )
         | 
| 89 | 
            +
             | 
| 90 | 
            +
            if __name__ == "__main__":
         | 
| 91 | 
            +
                demo.queue(default_concurrency_limit=1, max_size=50)
         | 
| 92 | 
            +
                demo.launch(server_name="0.0.0.0", server_port=7860)
         | 
    	
        requirements.txt
    ADDED
    
    | @@ -0,0 +1,4 @@ | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            gradio>=4.30      # 只要这一行即可
         | 
| 2 | 
            +
            openai>=1.30.5
         | 
| 3 | 
            +
            httpx>=0.27.0
         | 
| 4 | 
            +
            numpy>=1.26.0 
         | 
    	
        tool/.DS_Store
    ADDED
    
    | Binary file (6.15 kB). View file | 
|  | 
    	
        tool/README.md
    ADDED
    
    | 
            File without changes
         | 
    	
        tool/__init__.py
    ADDED
    
    | 
            File without changes
         | 
    	
        tool/input.txt
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            Write a program that transposes a matrix of 32-bit floating point numbers on a GPU. The transpose of a matrix switches its rows and columns. Given a matrix $A$ of dimensions rows $\times$ cols, the transpose $A^T$ will have dimensions cols $\times$ rows. All matrices are stored in row-major format.
         | 
    	
        tool/prompt_a.txt
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            The task is to write a CUDA kernel function on GPU, we have the input described as below:
         | 
| 2 | 
            +
            [input.txt]
         | 
| 3 | 
            +
            And we have also generated the benchmark code for this task: 
         | 
| 4 | 
            +
            [benchmark code]
         | 
| 5 | 
            +
            Optimize the kernel function for less execution time on GPU.
         | 
| 6 | 
            +
            The output should be the content of whole .cu file containing ONE kernel function.
         | 
| 7 | 
            +
            Do not modify the test part. Note the test data contains exactly five input sets. The generated .cu file must ensure that for each input set, the kernel function is called exactly once, resulting in a total of five kernel invocations. Do not include any extra timing logic, profiling wrappers, or repeat kernel calls that could cause each input to trigger multiple kernel launches.
         | 
    	
        tool/prompt_b.txt
    ADDED
    
    | @@ -0,0 +1,5 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            Modify the code with the execution error result.
         | 
| 2 | 
            +
            The output should be the content of whole .cu file containing ONE kernel function.
         | 
| 3 | 
            +
            Do not modify the test part. Note the test data contains exactly five input sets. The generated .cu file must ensure that for each input set, the kernel function is called exactly once, resulting in a total of five kernel invocations. Do not include any extra timing logic, profiling wrappers, or repeat kernel calls that could cause each input to trigger multiple kernel launches.
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            The execution output is:
         | 
    	
        tool/prompt_c.txt
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            Optimize the kernel function for less execution time on GPU.
         | 
| 2 | 
            +
            The output should be the content of whole .cu file containing ONE kernel function.
         | 
| 3 | 
            +
            Do not modify the test part. Note the test data contains exactly five input sets. The generated .cu file must ensure that for each input set, the kernel function is called exactly once, resulting in a total of five kernel invocations. Do not include any extra timing logic, profiling wrappers, or repeat kernel calls that could cause each input to trigger multiple kernel launches.
         | 
    	
        tool/prompt_e.txt
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            The code failed to launch the kernel. Modify the code.
         | 
| 2 | 
            +
            The output should be the content of whole .cu file containing ONE kernel function.
         | 
| 3 | 
            +
            Do not modify the test part. Note the test data contains exactly five input sets. The generated .cu file must ensure that for each input set, the kernel function is called exactly once, resulting in a total of five kernel invocations. Do not include any extra timing logic, profiling wrappers, or repeat kernel calls that could cause each input to trigger multiple kernel launches.
         | 
    	
        tool/prompt_f.txt
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            The result is not the same with the reference output. Modify the code.
         | 
| 2 | 
            +
            The output should be the content of whole .cu file containing ONE kernel function.
         | 
| 3 | 
            +
            Do not modify the test part. Note the test data contains exactly five input sets. The generated .cu file must ensure that for each input set, the kernel function is called exactly once, resulting in a total of five kernel invocations. Do not include any extra timing logic, profiling wrappers, or repeat kernel calls that could cause each input to trigger multiple kernel launches.
         | 
    	
        tool/sample.txt
    ADDED
    
    | @@ -0,0 +1,140 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            Given the following task description:
         | 
| 2 | 
            +
            [input.txt]
         | 
| 3 | 
            +
            Write a complete CUDA program (.cu file) that solves this task using a basic and correct algorithm. The implementation should include a kernel function and the main function that tests it.
         | 
| 4 | 
            +
            There are 5 sets of binary input data. The main function should test all five datasets. If all of them pass, the program should print "T", otherwise print "F". The output must strictly be either "T" or "F". Do not write any extra output.
         | 
| 5 | 
            +
            Here is a reference style for the structure of the .cu file:
         | 
| 6 | 
            +
            ```cu
         | 
| 7 | 
            +
            #include <iostream>
         | 
| 8 | 
            +
            #include <cuda_runtime.h>
         | 
| 9 | 
            +
            #include <device_launch_parameters.h>
         | 
| 10 | 
            +
            #include <cmath>
         | 
| 11 | 
            +
            #include <fstream>
         | 
| 12 | 
            +
            #include <vector>
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            #define C 10    
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            __global__ void cross_entropy_kernel(const float* logits, const int* labels, float* loss_sum, int N) {
         | 
| 17 | 
            +
                int j = blockIdx.x * blockDim.x + threadIdx.x; 
         | 
| 18 | 
            +
                if (j >= N) return;
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                float max_logit = -1e20f;
         | 
| 21 | 
            +
                for (int k = 0; k < C; ++k) {
         | 
| 22 | 
            +
                    float z = logits[j * C + k];
         | 
| 23 | 
            +
                    if (z > max_logit) max_logit = z;
         | 
| 24 | 
            +
                }
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                float exp_sum = 0.0f;
         | 
| 27 | 
            +
                for (int k = 0; k < C; ++k) {
         | 
| 28 | 
            +
                    exp_sum += expf(logits[j * C + k] - max_logit);
         | 
| 29 | 
            +
                }
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                float log_softmax_sum = logf(exp_sum);
         | 
| 32 | 
            +
                float loss_j = log_softmax_sum + max_logit - logits[j * C + labels[j]];
         | 
| 33 | 
            +
                atomicAdd(loss_sum, loss_j / N); 
         | 
| 34 | 
            +
            }
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            void read_binary_float(const std::string& filename, float* data, size_t size) {
         | 
| 37 | 
            +
                std::ifstream in(filename, std::ios::binary);
         | 
| 38 | 
            +
                if (!in) {
         | 
| 39 | 
            +
                    std::cerr << "Cannot open: " << filename << std::endl;
         | 
| 40 | 
            +
                    exit(1);
         | 
| 41 | 
            +
                }
         | 
| 42 | 
            +
                in.read(reinterpret_cast<char*>(data), size * sizeof(float));
         | 
| 43 | 
            +
                in.close();
         | 
| 44 | 
            +
            }
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            void read_binary_int(const std::string& filename, int* data, size_t size) {
         | 
| 47 | 
            +
                std::ifstream in(filename, std::ios::binary);
         | 
| 48 | 
            +
                if (!in) {
         | 
| 49 | 
            +
                    std::cerr << "Cannot open: " << filename << std::endl;
         | 
| 50 | 
            +
                    exit(1);
         | 
| 51 | 
            +
                }
         | 
| 52 | 
            +
                in.read(reinterpret_cast<char*>(data), size * sizeof(int));
         | 
| 53 | 
            +
                in.close();
         | 
| 54 | 
            +
            }
         | 
| 55 | 
            +
             | 
| 56 | 
            +
            // test
         | 
| 57 | 
            +
            bool compare_scalar(float a, float b, float tol = 1e-2f) {
         | 
| 58 | 
            +
                return fabs(a - b) < tol;
         | 
| 59 | 
            +
            }
         | 
| 60 | 
            +
             | 
| 61 | 
            +
            int main() {
         | 
| 62 | 
            +
                std::vector<size_t> Ns = {1<<14, 1<<16, 1<<18, 1<<20, 1<<22};
         | 
| 63 | 
            +
                bool all_match = true;
         | 
| 64 | 
            +
             | 
| 65 | 
            +
                for (int idx = 0; idx < Ns.size(); ++idx) {
         | 
| 66 | 
            +
                    size_t N = Ns[idx];
         | 
| 67 | 
            +
                    size_t logits_size = N * C;
         | 
| 68 | 
            +
                    size_t logits_bytes = logits_size * sizeof(float);
         | 
| 69 | 
            +
                    size_t labels_bytes = N * sizeof(int);
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                    // test
         | 
| 72 | 
            +
                    std::string logits_file = "data/ce_logits_" + std::to_string(idx + 1) + ".bin";
         | 
| 73 | 
            +
                    std::string labels_file = "data/ce_labels_" + std::to_string(idx + 1) + ".bin";
         | 
| 74 | 
            +
                    std::string ref_file    = "data/ce_ref_"    + std::to_string(idx + 1) + ".bin";
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                    float* h_logits = (float*)malloc(logits_bytes);
         | 
| 77 | 
            +
                    int* h_labels   = (int*)malloc(labels_bytes);
         | 
| 78 | 
            +
                    float h_ref;
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                    read_binary_float(logits_file, h_logits, logits_size);
         | 
| 81 | 
            +
                    read_binary_int(labels_file, h_labels, N);
         | 
| 82 | 
            +
                    read_binary_float(ref_file, &h_ref, 1);
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                    float *d_logits, *d_loss;
         | 
| 85 | 
            +
                    int* d_labels;
         | 
| 86 | 
            +
                    cudaMalloc(&d_logits, logits_bytes);
         | 
| 87 | 
            +
                    cudaMalloc(&d_labels, labels_bytes);
         | 
| 88 | 
            +
                    cudaMalloc(&d_loss, sizeof(float));
         | 
| 89 | 
            +
                    cudaMemcpy(d_logits, h_logits, logits_bytes, cudaMemcpyHostToDevice);
         | 
| 90 | 
            +
                    cudaMemcpy(d_labels, h_labels, labels_bytes, cudaMemcpyHostToDevice);
         | 
| 91 | 
            +
                    cudaMemset(d_loss, 0, sizeof(float));
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                    int threads = 256;
         | 
| 94 | 
            +
                    int blocks = (N + threads - 1) / threads;
         | 
| 95 | 
            +
                    cross_entropy_kernel<<<blocks, threads>>>(d_logits, d_labels, d_loss, N);
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                    float h_loss;
         | 
| 98 | 
            +
                    cudaMemcpy(&h_loss, d_loss, sizeof(float), cudaMemcpyDeviceToHost);
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                    if (!compare_scalar(h_loss, h_ref)) {
         | 
| 101 | 
            +
                        std::cout << "F" << std::endl;
         | 
| 102 | 
            +
                        all_match = false;
         | 
| 103 | 
            +
                        cudaFree(d_logits); cudaFree(d_labels); cudaFree(d_loss);
         | 
| 104 | 
            +
                        free(h_logits); free(h_labels);
         | 
| 105 | 
            +
                        break;
         | 
| 106 | 
            +
                    }
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                    cudaFree(d_logits); cudaFree(d_labels); cudaFree(d_loss);
         | 
| 109 | 
            +
                    free(h_logits); free(h_labels);
         | 
| 110 | 
            +
                }
         | 
| 111 | 
            +
             | 
| 112 | 
            +
                if (all_match) std::cout << "T" << std::endl;
         | 
| 113 | 
            +
                return 0;
         | 
| 114 | 
            +
            }
         | 
| 115 | 
            +
            ```
         | 
| 116 | 
            +
            You also need to write the Python script gen_test_data.py to generate the test binary files. Use the following as a style reference. Make sure all binary files are stored under a folder named data/. Avoid generating extremely large data. Suggested sizes are shown below:
         | 
| 117 | 
            +
            ```py
         | 
| 118 | 
            +
            import numpy as np
         | 
| 119 | 
            +
            import os
         | 
| 120 | 
            +
             | 
| 121 | 
            +
            np.random.seed(30)
         | 
| 122 | 
            +
             | 
| 123 | 
            +
            C = 10
         | 
| 124 | 
            +
            sizes = [2**14, 2**16, 2**18, 2**20, 2**22]
         | 
| 125 | 
            +
             | 
| 126 | 
            +
            for idx, N in enumerate(sizes):
         | 
| 127 | 
            +
                logits = (np.random.randn(N, C) * 3).astype(np.float32)
         | 
| 128 | 
            +
                labels = np.random.randint(0, C, size=N, dtype=np.int32)
         | 
| 129 | 
            +
             | 
| 130 | 
            +
                logits_max = logits.max(axis=1, keepdims=True)
         | 
| 131 | 
            +
                exp_logits = np.exp(logits - logits_max)
         | 
| 132 | 
            +
                log_sum_exp = np.log(exp_logits.sum(axis=1))
         | 
| 133 | 
            +
                losses = log_sum_exp + logits_max[:, 0] - logits[np.arange(N), labels]
         | 
| 134 | 
            +
                avg_loss = losses.mean().astype(np.float32)
         | 
| 135 | 
            +
             | 
| 136 | 
            +
                logits.tofile(f"ce_logits_{idx+1}.bin")
         | 
| 137 | 
            +
                labels.tofile(f"ce_labels_{idx+1}.bin")
         | 
| 138 | 
            +
                np.array([avg_loss], dtype=np.float32).tofile(f"ce_ref_{idx+1}.bin")
         | 
| 139 | 
            +
            ```
         | 
| 140 | 
            +
            You should generate these two code blocks at once.
         | 
