Justin Chou committed on
Commit 679abc4
1 Parent(s): e9c2b75
This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. Dockerfile +14 -4
  2. demo/app.py +0 -312
  3. demo/readme.md +0 -9
  4. demo/requirements.txt +0 -17
  5. hardware_accelerators/__init__.py +16 -1
  6. hardware_accelerators/analysis/__init__.py +0 -0
  7. hardware_accelerators/analysis/config.py +25 -0
  8. hardware_accelerators/analysis/flow/designs/sky130hd/mydesign/config.mk +10 -0
  9. hardware_accelerators/analysis/flow/designs/sky130hd/mydesign/constraint.sdc +1 -0
  10. hardware_accelerators/analysis/generate.py +958 -0
  11. hardware_accelerators/analysis/hardware_stats.py +458 -0
  12. hardware_accelerators/analysis/mnist_eval.py +274 -0
  13. hardware_accelerators/analysis/simple_circuits.py +258 -0
  14. hardware_accelerators/analysis/verilog_export.py +86 -0
  15. hardware_accelerators/analysis/verilog_output/pipelined_adder_BF16.v +37 -0
  16. hardware_accelerators/analysis/verilog_output/pipelined_adder_Float16.v +37 -0
  17. hardware_accelerators/analysis/verilog_output/pipelined_adder_Float32.v +37 -0
  18. hardware_accelerators/analysis/verilog_output/pipelined_adder_Float8.v +37 -0
  19. hardware_accelerators/analysis/verilog_output/pipelined_multiplier_BF16.v +37 -0
  20. hardware_accelerators/analysis/verilog_output/pipelined_multiplier_Float16.v +37 -0
  21. hardware_accelerators/analysis/verilog_output/pipelined_multiplier_Float32.v +37 -0
  22. hardware_accelerators/analysis/verilog_output/pipelined_multiplier_Float8.v +37 -0
  23. hardware_accelerators/analysis/verilog_output/simple_adder_BF16.v +21 -0
  24. hardware_accelerators/analysis/verilog_output/simple_adder_Float16.v +21 -0
  25. hardware_accelerators/analysis/verilog_output/simple_adder_Float32.v +21 -0
  26. hardware_accelerators/analysis/verilog_output/simple_adder_Float8.v +21 -0
  27. hardware_accelerators/analysis/verilog_output/simple_multiplier_BF16.v +21 -0
  28. hardware_accelerators/analysis/verilog_output/simple_multiplier_Float16.v +21 -0
  29. hardware_accelerators/analysis/verilog_output/simple_multiplier_Float32.v +21 -0
  30. hardware_accelerators/analysis/verilog_output/simple_multiplier_Float8.v +21 -0
  31. hardware_accelerators/app.py +388 -0
  32. hardware_accelerators/compile.py +167 -0
  33. hardware_accelerators/dtypes/__init__.py +3 -1
  34. hardware_accelerators/dtypes/base.py +12 -3
  35. hardware_accelerators/dtypes/bfloat16.py +4 -0
  36. hardware_accelerators/dtypes/float16.py +167 -0
  37. hardware_accelerators/dtypes/float32.py +174 -0
  38. hardware_accelerators/dtypes/float8.py +4 -0
  39. hardware_accelerators/nn/lmul.py +135 -0
  40. hardware_accelerators/nn/precision.py +264 -0
  41. hardware_accelerators/nn/precision_eval.py +280 -0
  42. hardware_accelerators/nn/run_precision_comparison.py +78 -0
  43. hardware_accelerators/nn/train.py +0 -2
  44. hardware_accelerators/nn/util.py +3 -1
  45. hardware_accelerators/rtllib/__init__.py +10 -2
  46. hardware_accelerators/rtllib/accelerator.py +407 -113
  47. hardware_accelerators/rtllib/activations.py +69 -7
  48. hardware_accelerators/rtllib/adders.py +54 -8
  49. hardware_accelerators/rtllib/legacy.py +71 -0
  50. hardware_accelerators/rtllib/lmul.py +63 -60
Dockerfile CHANGED
@@ -1,3 +1,4 @@
+ # Dockerfile for the demo
  FROM python:3.12-slim

  WORKDIR /code
@@ -9,20 +10,29 @@ RUN apt-get update && apt-get install -y \
      && rm -rf /var/lib/apt/lists/*

  # Install Python packages
- COPY demo/requirements.txt requirements.txt
+ COPY ./requirements.txt requirements.txt
  RUN pip install --no-cache-dir -r requirements.txt

  # Copy the model and application files
  COPY models/ /code/models/
  COPY hardware_accelerators/ /code/hardware_accelerators/
- COPY demo/app.py /code/app.py
+ COPY results/component_data.csv /code/data/

- # Set environment variables for Gradio
+
+ # Set environment variables
+ ENV HWA_CACHE_DIR=/code/hardware_accelerators/cache
+ ENV COMPONENT_DATA_PATH=/code/data/component_data.csv
  ENV GRADIO_SERVER_NAME=0.0.0.0
  ENV GRADIO_SERVER_PORT=7860

+ # Copy the component data
+ COPY results/component_data.csv /code/data/component_data.csv
+
+ # Compile the simulations
+ RUN python3 -m hardware_accelerators.compile
+
  # Expose the port Gradio runs on
  EXPOSE 7860

  # Command to run the Gradio app
- CMD ["python", "app.py"]
+ CMD ["python3", "-m", "hardware_accelerators.app"]
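The updated image drives configuration through environment variables and pre-compiles the simulators at build time. A minimal sketch of how application code might pick up those variables (the names come from the Dockerfile above; the fallback paths are illustrative assumptions, not values from this commit):

import os
from pathlib import Path

# Variables set in the Dockerfile; the defaults here are placeholders for local runs.
cache_dir = Path(os.environ.get("HWA_CACHE_DIR", "hardware_accelerators/cache"))
component_data = Path(os.environ.get("COMPONENT_DATA_PATH", "results/component_data.csv"))

cache_dir.mkdir(parents=True, exist_ok=True)
print(f"simulation cache: {cache_dir}")
print(f"component data:   {component_data}")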
demo/app.py DELETED
@@ -1,312 +0,0 @@
1
- import sys
2
- import gradio as gr
3
- from gradio.components.image_editor import EditorValue
4
- import numpy as np
5
- import torch
6
- import pandas as pd
7
- from PIL import Image
8
- import torch
9
- import torch.nn as nn
10
- import torchvision.transforms as transforms
11
- import tqdm
12
- import pyrtl
13
-
14
- sys.path.append(".")
15
- from hardware_accelerators.nn.util import softmax
16
- from hardware_accelerators.simulation.matrix_utils import (
17
- bias_trick,
18
- count_total_gemv_tiles,
19
- generate_gemv_tiles,
20
- )
21
- from hardware_accelerators.rtllib.adders import float_adder
22
- from hardware_accelerators.rtllib.multipliers import float_multiplier
23
- from hardware_accelerators.dtypes.bfloat16 import BF16
24
- from hardware_accelerators.dtypes.float8 import Float8
25
- from hardware_accelerators.nn import model_factory, get_pytorch_device
26
- from hardware_accelerators.rtllib import (
27
- AcceleratorConfig,
28
- Accelerator,
29
- lmul_fast,
30
- float_multiplier,
31
- )
32
- from hardware_accelerators.simulation import CompiledSimulator, AcceleratorSimulator
33
-
34
-
35
- # ------------ CONSTANTS ------------ #
36
-
37
- # Load the trained model
38
- model_path = "models/mlp_mnist.pth"
39
- model = model_factory()
40
- model.load_state_dict(
41
- torch.load(model_path, map_location=get_pytorch_device(), weights_only=True)
42
- )
43
- model.eval()
44
-
45
- classes = [
46
- "zero",
47
- "one",
48
- "two",
49
- "three",
50
- "four",
51
- "five",
52
- "six",
53
- "seven",
54
- "eight",
55
- "nine",
56
- ]
57
- labels_value = {label: 0.0 for label in classes}
58
-
59
- accelerator_dtypes = ["float8", "bfloat16"]
60
- # accelerator_dtypes = ["float8", "float16", "bfloat16", "float32"]
61
-
62
- dtype_map = {"float8": Float8, "bfloat16": BF16}
63
-
64
- default_config = {
65
- "activations_dtype": "bfloat16",
66
- "weights_dtype": "bfloat16",
67
- "size": 4,
68
- "multiplication": "IEEE 754",
69
- }
70
-
71
- mult_map = {
72
- "IEEE 754": float_multiplier,
73
- "l-mul": lmul_fast,
74
- }
75
-
76
-
77
- # ------------ Event Listener Functions ------------ #
78
-
79
-
80
- def image_to_tensor(sketchpad: EditorValue):
81
- image = sketchpad["composite"]
82
- image = image.resize((28, 28), Image.Resampling.LANCZOS) # type: ignore
83
- img_array = np.transpose(np.array(image), (2, 0, 1))[-1]
84
-
85
- # Preprocessing: convert image to tensor and normalize
86
- transform = transforms.Compose(
87
- [
88
- transforms.ToTensor(),
89
- transforms.Normalize((0.1307,), (0.3081,)),
90
- ]
91
- )
92
- tensor_image = transform(img_array).unsqueeze(0) # Add batch dimension
93
- return tensor_image
94
-
95
-
96
- def torch_predict(sketchpad: EditorValue):
97
- tensor_image = image_to_tensor(sketchpad)
98
- with torch.no_grad():
99
- logits = model(tensor_image)
100
- probabilities = torch.softmax(logits, dim=1).squeeze(0)
101
- result = {cls: float(prob) for cls, prob in zip(classes, probabilities)}
102
- return result
103
-
104
-
105
- def update_accelerator_config(
106
- activations_dtype: str, weights_dtype: str, size: int, multiplication: str
107
- ) -> AcceleratorConfig:
108
-
109
- # Triggered by run simulation button
110
- print("update_accelerator_config fn called")
111
- print(activations_dtype, weights_dtype, size, multiplication)
112
-
113
- return AcceleratorConfig(
114
- num_weight_tiles=4,
115
- weight_type=dtype_map[weights_dtype],
116
- data_type=dtype_map[activations_dtype],
117
- array_size=size,
118
- pe_multiplier=mult_map[multiplication],
119
- pe_adder=float_adder,
120
- accum_adder=float_adder,
121
- accum_addr_width=8,
122
- accum_type=dtype_map[activations_dtype],
123
- pipeline=False,
124
- )
125
-
126
-
127
- def simulator_predict(sketchpad: EditorValue, config: AcceleratorConfig):
128
- # if config == DEFAULT_ACCELERATOR_CONFIG:
129
- # sim = ACCELERATOR_SIM
130
- # else:
131
- # sim = AcceleratorSimulator(config=config)
132
-
133
- sim = CompiledSimulator(config=config)
134
- image = image_to_tensor(sketchpad).detach().numpy().flatten()
135
- probabilities = sim.run_mlp(model, image)
136
- result = {cls: float(prob) for cls, prob in zip(classes, probabilities)}
137
- return result
138
-
139
-
140
- def sim_predict_progress(
141
- sketchpad: EditorValue,
142
- config: AcceleratorConfig,
143
- gr_progress=gr.Progress(track_tqdm=True),
144
- ):
145
- pyrtl.reset_working_block()
146
- simulator = CompiledSimulator(config=config)
147
- chunk_size = config.array_size
148
-
149
- x = image_to_tensor(sketchpad).detach().numpy().flatten()
150
- probabilities = simulator.run_mlp(model, x)
151
- return {cls: float(prob) for cls, prob in zip(classes, probabilities)}
152
-
153
- weights_1 = model.fc1.weight.numpy(force=True)
154
- bias_1 = model.fc1.bias.numpy(force=True)
155
- weights_2 = model.fc2.weight.numpy(force=True)
156
- bias_2 = model.fc2.bias.numpy(force=True)
157
-
158
- # Add bias to first layer weights and 1 to activations
159
- W_aug, x_aug = bias_trick(weights_1, bias_1, x)
160
-
161
- total_tiles = count_total_gemv_tiles([(784, 128), (128, 10)], chunk_size)
162
- progress = tqdm.tqdm(total=total_tiles)
163
-
164
- tile_generator = generate_gemv_tiles(x_aug, W_aug, chunk_size)
165
-
166
- for tile in tile_generator:
167
- simulator.load_weights(weights=tile.matrix.T, tile_addr=0)
168
- simulator.execute_instruction(
169
- load_new_weights=True,
170
- weight_tile_addr=0,
171
- data_vec=tile.vector,
172
- accum_addr=tile.index,
173
- accum_mode=not tile.first,
174
- activation_func="relu",
175
- activation_enable=tile.last,
176
- flush_pipeline=True,
177
- )
178
- progress.update()
179
-
180
- simulator.execute_instruction(nop=True)
181
- simulator.execute_instruction(nop=True)
182
-
183
- sim_fc1 = np.array(simulator.output_trace)
184
-
185
- # simulator.reset_output_trace()
186
- simulator.output_trace = []
187
-
188
- W2_aug, fc1_aug = bias_trick(weights_2, bias_2, sim_fc1.flatten())
189
-
190
- fc2_tile_generator = generate_gemv_tiles(fc1_aug, W2_aug, chunk_size)
191
-
192
- for tile in fc2_tile_generator:
193
- simulator.load_weights(weights=tile.matrix.T, tile_addr=0)
194
- simulator.execute_instruction(
195
- load_new_weights=True,
196
- weight_tile_addr=0,
197
- data_vec=tile.vector,
198
- accum_addr=tile.index,
199
- accum_mode=not tile.first,
200
- activation_enable=tile.last,
201
- flush_pipeline=True,
202
- )
203
- progress.update()
204
-
205
- simulator.execute_instruction(nop=True)
206
- simulator.execute_instruction(nop=True)
207
-
208
- sim_fc2 = np.array(simulator.output_trace).flatten()
209
- probabilities = softmax(sim_fc2)
210
- result = {cls: float(prob) for cls, prob in zip(classes, probabilities)}
211
- return result
212
-
213
-
214
- # ------------ Blocks UI Layout ------------ #
215
-
216
- with gr.Blocks(fill_height=False) as demo:
217
-
218
- accelerator_config = gr.State()
219
-
220
- gr.Markdown("## Draw a digit to see the model's prediction")
221
- with gr.Row(equal_height=True):
222
- with gr.Column():
223
- sketchpad = gr.Sketchpad(
224
- # label="Draw a digit",
225
- type="pil", # Changed to PIL
226
- transforms=(),
227
- layers=False,
228
- canvas_size=(400, 400),
229
- )
230
-
231
- with gr.Row():
232
- predict_btn = gr.Button("Run Hardware Simulation", variant="primary")
233
-
234
- # with gr.Accordion("Accelerator Configuration", open=True):
235
- with gr.Group():
236
- weight_dtype_component = gr.Radio(
237
- label="Weights d-type",
238
- choices=accelerator_dtypes,
239
- value=default_config["weights_dtype"],
240
- interactive=True,
241
- )
242
- activation_dtype_component = gr.Radio(
243
- label="Activations d-type",
244
- choices=accelerator_dtypes,
245
- value=default_config["activations_dtype"],
246
- interactive=True,
247
- )
248
- systolic_array_size_component = gr.Slider(
249
- label="Systolic Array Size",
250
- info="Large values will significantly slow down the simulation",
251
- minimum=2,
252
- maximum=16,
253
- step=1,
254
- value=default_config["size"],
255
- interactive=True,
256
- )
257
- multiply_component = gr.Radio(
258
- label="Multiplication Type",
259
- choices=["IEEE 754", "l-mul"],
260
- value=default_config["multiplication"],
261
- interactive=True,
262
- )
263
-
264
- with gr.Column():
265
- pytorch_output = gr.Label(
266
- label="Pytorch Ground Truth Predictions", value=labels_value
267
- )
268
-
269
- sim_output = gr.Label(
270
- label="Hardware Simulator Predictions", value=labels_value
271
- )
272
-
273
- # ------------ Event Listeners ------------ #
274
-
275
- sketchpad.input(
276
- fn=torch_predict,
277
- inputs=sketchpad,
278
- outputs=pytorch_output,
279
- )
280
-
281
- # TODO: implement simulator_predict
282
- predict_btn.click(
283
- fn=update_accelerator_config,
284
- inputs=[
285
- activation_dtype_component,
286
- weight_dtype_component,
287
- systolic_array_size_component,
288
- multiply_component,
289
- ],
290
- outputs=accelerator_config,
291
- ).then(
292
- fn=sim_predict_progress,
293
- inputs=[sketchpad, accelerator_config],
294
- outputs=sim_output,
295
- )
296
-
297
- # gr.on(
298
- # fn=update_accelerator_config,
299
- # inputs=[
300
- # activation_dtype_component,
301
- # weight_dtype_component,
302
- # systolic_array_size_component,
303
- # multiply_component,
304
- # ],
305
- # outputs=accelerator_config,
306
- # )
307
-
308
- # ------------
309
-
310
- if __name__ == "__main__":
311
- demo.queue()
312
- demo.launch(share=False)
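The deleted demo ran the MLP on the accelerator simulator layer by layer, folding each layer's bias into the weight matrix with bias_trick before tiling the GEMV. A minimal numpy sketch of that augmentation (illustrative only; the real helper lives in hardware_accelerators.simulation.matrix_utils):

import numpy as np

def bias_trick_sketch(W, b, x):
    """Fold the bias into the weights so that W_aug @ x_aug == W @ x + b."""
    W_aug = np.hstack([W, b.reshape(-1, 1)])  # (out, in + 1)
    x_aug = np.append(x, 1.0)                 # (in + 1,)
    return W_aug, x_aug

# Shapes match the first MLP layer used by the demo (784 -> 128).
W = np.random.randn(128, 784)
b = np.random.randn(128)
x = np.random.randn(784)
W_aug, x_aug = bias_trick_sketch(W, b, x)
assert np.allclose(W_aug @ x_aug, W @ x + b)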
demo/readme.md DELETED
@@ -1,9 +0,0 @@
- # Interactive Demo
-
- This directory contains a demo where you can test out and compare the performance of the hardware accelerator with a software implementation of the same model. The demo is built using [Gradio](https://gradio.app/), a Python library for creating interactive web applications.
-
- ## Running the Demo
-
- Install the project requirements, then from the repo root simply run `python demo/app.py` to start the demo.
-
- We will also provide a ready to go Docker image soon!
demo/requirements.txt DELETED
@@ -1,17 +0,0 @@
- jupyter==1.1.1
- ipykernel==6.29.5
- tqdm==4.67.0
- numpy==2.2.1
- ipython==8.12.3
- isort==5.13.2
- numpy==2.2.1
- pandas==2.2.3
- pyrtl==0.11.2
- matplotlib==3.10.0
- pytest==8.3.4
- torch==2.4.1
- torchvision==0.19.1
- onnx==1.17.0
- netron==8.1.3
- gradio==5.16.0
- black[jupyter]==24.10.0
hardware_accelerators/__init__.py CHANGED
@@ -1,4 +1,8 @@
- from .dtypes import BF16, Float8
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ from .dtypes import BF16, Float8, Float16, Float32
  from .rtllib import (
      FloatAdderPipelined,
      FloatMultiplierPipelined,
@@ -8,10 +12,21 @@ from .rtllib import (
      lmul_fast,
      lmul_simple,
  )
+ from .simulation import (
+     get_sim_cache_dir,
+     set_sim_cache_dir,
+     CompiledAcceleratorSimulator,
+ )
+

  __all__ = [
+     "get_sim_cache_dir",
+     "set_sim_cache_dir",
+     "CompiledAcceleratorSimulator",
      "Float8",
      "BF16",
+     "Float16",
+     "Float32",
      "float_adder",
      "FloatAdderPipelined",
      "float_multiplier",
hardware_accelerators/analysis/__init__.py ADDED
File without changes
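With the updated top-level __init__.py above, the simulation cache helpers and the compiled simulator are importable from the package root. A small, hypothetical usage sketch (it assumes set_sim_cache_dir accepts a directory path; the CompiledAcceleratorSimulator constructor is not shown in this view, so it is only imported here):

import os

from hardware_accelerators import (
    BF16,
    Float8,
    CompiledAcceleratorSimulator,  # constructor arguments not shown in this diff
    get_sim_cache_dir,
    set_sim_cache_dir,
)

# Mirror the Dockerfile's HWA_CACHE_DIR when running outside the container (assumed API).
set_sim_cache_dir(os.environ.get("HWA_CACHE_DIR", ".hwa_cache"))
print("simulation cache directory:", get_sim_cache_dir())
print("dtypes available:", BF16.__name__, Float8.__name__)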
hardware_accelerators/analysis/config.py ADDED
@@ -0,0 +1,25 @@
+ from ..dtypes import *
+ from ..rtllib.lmul import lmul_fast, lmul_simple
+ from ..rtllib.multipliers import float_multiplier
+
+
+ NN_TEST_BATCH_SIZE = 64
+
+ NN_TEST_SYSTOLIC_ARRAY_SIZE = 8
+
+ NN_TEST_ACCUM_ADDR_WIDTH = 12
+
+ NN_TEST_MUL_FNS = [
+     float_multiplier,
+     lmul_simple,
+     # lmul_fast,
+ ]
+
+ NN_TEST_WA_DTYPES = [
+     # (Float8, Float8),
+     (Float8, BF16),
+     (Float8, Float32),
+     (BF16, BF16),
+     # (BF16, Float32),
+     # (Float32, Float32),
+ ]
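config.py enumerates the (weight, activation) dtype pairs and multiplier functions that the precision experiments sweep over. A minimal sketch of how these constants could be expanded into a run list (the loop below is illustrative; the actual experiment drivers are nn/precision_eval.py and nn/run_precision_comparison.py, whose diffs are not shown here):

from itertools import product

from hardware_accelerators.analysis.config import (
    NN_TEST_MUL_FNS,
    NN_TEST_SYSTOLIC_ARRAY_SIZE,
    NN_TEST_WA_DTYPES,
)

# One configuration per (weight dtype, activation dtype) x multiplier function.
for (weight_dtype, act_dtype), mul_fn in product(NN_TEST_WA_DTYPES, NN_TEST_MUL_FNS):
    print(
        f"array={NN_TEST_SYSTOLIC_ARRAY_SIZE}x{NN_TEST_SYSTOLIC_ARRAY_SIZE} "
        f"weights={weight_dtype.__name__} activations={act_dtype.__name__} "
        f"multiplier={mul_fn.__name__}"
    )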
hardware_accelerators/analysis/flow/designs/sky130hd/mydesign/config.mk ADDED
@@ -0,0 +1,10 @@
+ export DESIGN_NAME = lmul_pipelined_fast
+ export PLATFORM = nangate45
+ export VERILOG_FILES = $(DESIGN_DIR)/src/lmul_pipelined_fast.v
+ export SDC_FILE = $(DESIGN_DIR)/constraint.sdc
+
+ # These values must be multiples of placement site
+ export DIE_AREA = 0 0 100 100
+ export CORE_AREA = 10 10 90 90
+
+ export CLOCK_PERIOD = 1.0
hardware_accelerators/analysis/flow/designs/sky130hd/mydesign/constraint.sdc ADDED
@@ -0,0 +1 @@
+ create_clock -name clk -period 1.0 [get_ports {clk}]
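The SDC constrains the clk port to a 1.0 ns period, i.e. a 1 GHz target, matching CLOCK_PERIOD in config.mk. A quick, hypothetical sanity check against the PyRTL timing numbers that analyze() in generate.py below reports (assuming max_delay is in picoseconds, as the RTLAnalysis repr indicates):

# Compare the 1.0 ns SDC clock target against a PyRTL timing estimate.
CLOCK_PERIOD_NS = 1.0                       # from config.mk / constraint.sdc
target_freq_mhz = 1e3 / CLOCK_PERIOD_NS     # 1000 MHz

max_delay_ps = 850.0                        # placeholder value; use RTLAnalysis.max_delay
achievable_freq_mhz = 1e6 / max_delay_ps    # ps -> MHz

print(f"target {target_freq_mhz:.0f} MHz, achievable {achievable_freq_mhz:.0f} MHz")
print("meets target" if achievable_freq_mhz >= target_freq_mhz else "misses target")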
hardware_accelerators/analysis/generate.py ADDED
@@ -0,0 +1,958 @@
1
+ from pathlib import Path
3
+ from pandas import DataFrame
4
+ import pyrtl
5
+ from itertools import product
6
+ from pyrtl import *
7
+ from dataclasses import dataclass
8
+ from typing import Callable, Type, Literal, Optional
9
+
10
+ from .verilog_export import export_to_verilog
11
+ from ..dtypes import *
12
+ from ..rtllib import *
13
+ from ..rtllib.processing_element import ProcessingElement
14
+ from ..rtllib.adders import *
15
+ from ..rtllib.multipliers import *
16
+ from ..rtllib.lmul import *
17
+ from ..rtllib.utils.common import *
18
+ from ..simulation.utils import *
19
+
20
+
21
+ def create_inputs(**named_bitwidths):
22
+ """
23
+ Create PyRTL Input wires with specified bitwidths.
24
+
25
+ Args:
26
+ **named_bitwidths: Named bitwidths where the key is used as the wire name
27
+
28
+ Returns:
29
+ Generator of PyRTL Input wires
30
+
31
+ Note:
32
+ You must use all keyword arguments
33
+ """
34
+
35
+ # If using keyword arguments
36
+ for name, bitwidth in named_bitwidths.items():
37
+ yield pyrtl.Input(bitwidth, name=name) # type: ignore
38
+
39
+
40
+ def create_outputs(*args, **named_wires):
41
+ """
42
+ Create PyRTL Output wires connected to the input wires.
43
+
44
+ Args:
45
+ *args: Variable number of wires to connect to unnamed outputs
46
+ **named_wires: Named wires where the key is used as the output wire name
47
+
48
+ Note:
49
+ You must use either all positional arguments or all keyword arguments, not a mix.
50
+ """
51
+ if args and named_wires:
52
+ raise ValueError(
53
+ "Please use either all positional arguments or all keyword arguments, not a mix."
54
+ )
55
+
56
+ # If using positional arguments
57
+ for wire in args:
58
+ out = pyrtl.Output(len(wire), name=wire.name.replace("tmp", "out")) # type: ignore
59
+ out <<= wire
60
+
61
+ # If using keyword arguments
62
+ for name, wire in named_wires.items():
63
+ out = pyrtl.Output(len(wire), name=name) # type: ignore
64
+ out <<= wire
65
+
66
+
67
+ @dataclass
68
+ class RTLAnalysis:
69
+ """Results of RTL analysis."""
70
+
71
+ max_delay: float
72
+ max_freq: float
73
+ logic_area: float
74
+ mem_area: float
75
+ name: Optional[str] = None
76
+
77
+ def __repr__(self):
78
+ if self.name is None:
79
+ return (
80
+ f"RTLAnalysisResults("
81
+ f"max_delay={self.max_delay:.2f} ps, "
82
+ f"max_freq={self.max_freq:.2f} MHz, "
83
+ f"logic_area={self.logic_area:.2f}um², "
84
+ f"mem_area={self.mem_area:.2f}um²)"
85
+ )
86
+ else:
87
+ return (
88
+ f"RTLAnalysisResults for {self.name}:\n\t"
89
+ f"max_delay={self.max_delay:.2f} ps\n\t"
90
+ f"max_freq={self.max_freq:.2f} MHz\n\t"
91
+ f"logic_area={self.logic_area:.2f}um²\n\t"
92
+ f"mem_area={self.mem_area:.2f}um²"
93
+ )
94
+
95
+
96
+ def analyze(
97
+ block: Block | None = None, synth: bool = True, opt: bool = True, name=None
98
+ ):
99
+ if block is not None:
100
+ pyrtl.set_working_block(block)
101
+
102
+ if synth:
103
+ pyrtl.synthesize()
104
+ if opt:
105
+ pyrtl.optimize()
106
+
107
+ timing = pyrtl.TimingAnalysis()
108
+ max_delay = timing.max_length()
109
+ max_freq = timing.max_freq()
110
+ logic_area, mem_area = pyrtl.area_estimation()
111
+
112
+ return RTLAnalysis(
113
+ name=name,
114
+ max_delay=max_delay,
115
+ max_freq=max_freq,
116
+ logic_area=logic_area * 1e6,
117
+ mem_area=mem_area * 1e6,
118
+ )
119
+
120
+
121
+ def create_adder_blocks(dtype: Type[BaseFloat]) -> dict[str, Block]:
122
+ bits = dtype.bitwidth()
123
+ e_bits, m_bits = dtype.exponent_bits(), dtype.mantissa_bits()
124
+
125
+ combinational_block = pyrtl.Block()
126
+ combinational_fast_block = pyrtl.Block()
127
+ adder_pipelined_block = pyrtl.Block()
128
+ adder_pipelined_fast_block = pyrtl.Block()
129
+ stage_2_block = pyrtl.Block()
130
+ stage_2_fast_block = pyrtl.Block()
131
+ stage_3_block = pyrtl.Block()
132
+ stage_4_block = pyrtl.Block()
133
+ stage_4_fast_block = pyrtl.Block()
134
+ stage_5_block = pyrtl.Block()
135
+
136
+ # Combinational design
137
+ with set_working_block(combinational_block):
138
+ create_outputs(
139
+ *float_adder(
140
+ *create_inputs(float_a=bits, float_b=bits), dtype=dtype, fast=False
141
+ )
142
+ )
143
+
144
+ with set_working_block(combinational_fast_block):
145
+ create_outputs(
146
+ *float_adder(
147
+ *create_inputs(float_a=bits, float_b=bits), dtype=dtype, fast=True
148
+ )
149
+ )
150
+
151
+ # Complete pipelined design
152
+ with set_working_block(adder_pipelined_block):
153
+ create_outputs(
154
+ float_adder_pipelined(
155
+ *create_inputs(float_a=bits, float_b=bits),
156
+ dtype=dtype,
157
+ fast=False,
158
+ )
159
+ )
160
+
161
+ with set_working_block(adder_pipelined_fast_block):
162
+ create_outputs(
163
+ float_adder_pipelined(
164
+ *create_inputs(float_a=bits, float_b=bits),
165
+ dtype=dtype,
166
+ fast=True,
167
+ )
168
+ )
169
+
170
+ # Stages 1 & 2
171
+ with set_working_block(stage_2_block):
172
+ float_components = extract_float_components(
173
+ *create_inputs(float_a=bits, float_b=bits),
174
+ e_bits=e_bits,
175
+ m_bits=m_bits,
176
+ )
177
+ stage_2_outputs = adder_stage_2(
178
+ *float_components,
179
+ e_bits,
180
+ m_bits,
181
+ fast=False,
182
+ )
183
+ create_outputs(*stage_2_outputs)
184
+
185
+ with set_working_block(stage_2_fast_block):
186
+ float_components = extract_float_components(
187
+ *create_inputs(float_a=bits, float_b=bits),
188
+ e_bits=e_bits,
189
+ m_bits=m_bits,
190
+ )
191
+ stage_2_outputs = adder_stage_2(
192
+ *float_components,
193
+ e_bits,
194
+ m_bits,
195
+ fast=True,
196
+ )
197
+ create_outputs(*stage_2_outputs)
198
+
199
+ # Stage 3
200
+ with set_working_block(stage_3_block):
201
+ # Perform alignment and generate SGR bits
202
+ stage_3_outputs = adder_stage_3(
203
+ *create_inputs(mant_smaller=m_bits + 1, shift_amount=e_bits),
204
+ e_bits=e_bits,
205
+ m_bits=m_bits,
206
+ )
207
+ create_outputs(*stage_3_outputs)
208
+
209
+ # Stage 4
210
+ with set_working_block(stage_4_block):
211
+ # Perform mantissa addition and leading zero detection
212
+ stage_4_outputs = adder_stage_4(
213
+ *create_inputs(mant_aligned=m_bits + 1, mant_unchanged=m_bits + 1, s_xor=1),
214
+ m_bits=m_bits,
215
+ fast=False,
216
+ )
217
+ create_outputs(*stage_4_outputs)
218
+
219
+ with set_working_block(stage_4_fast_block):
220
+ # Perform mantissa addition and leading zero detection
221
+ stage_4_outputs = adder_stage_4(
222
+ *create_inputs(mant_aligned=m_bits + 1, mant_unchanged=m_bits + 1, s_xor=1),
223
+ m_bits=m_bits,
224
+ fast=True,
225
+ )
226
+ create_outputs(*stage_4_outputs)
227
+
228
+ # Stage 5
229
+ with set_working_block(stage_5_block):
230
+ # Perform normalization, rounding, and final assembly
231
+ stage_5_outputs = adder_stage_5(
232
+ *create_inputs(
233
+ abs_mantissa=m_bits + 2,
234
+ sticky_bit=1,
235
+ guard_bit=1,
236
+ round_bit=1,
237
+ lzc=4,
238
+ exp_larger=e_bits,
239
+ sign_a=1,
240
+ sign_b=1,
241
+ exp_diff=e_bits + 1,
242
+ is_neg=1,
243
+ ),
244
+ e_bits=e_bits,
245
+ m_bits=m_bits,
246
+ )
247
+ create_outputs(*stage_5_outputs)
248
+
249
+ # Return all the generated blocks for analysis
250
+ return {
251
+ "adder_combinational": combinational_block,
252
+ "adder_combinational_fast": combinational_fast_block,
253
+ "adder_pipelined": adder_pipelined_block,
254
+ "adder_pipelined_fast": adder_pipelined_fast_block,
255
+ "adder_stage_2": stage_2_block,
256
+ "adder_stage_2_fast": stage_2_fast_block,
257
+ "adder_stage_3": stage_3_block,
258
+ "adder_stage_4": stage_4_block,
259
+ "adder_stage_4_fast": stage_4_fast_block,
260
+ "adder_stage_5": stage_5_block,
261
+ }
262
+
263
+
264
+ def create_multiplier_blocks(dtype: Type[BaseFloat], fast: bool) -> dict[str, Block]:
265
+ bits = dtype.bitwidth()
266
+ e_bits, m_bits = dtype.exponent_bits(), dtype.mantissa_bits()
267
+
268
+ combinational_block = pyrtl.Block()
269
+ multiplier_block = pyrtl.Block()
270
+ stage_2_block = pyrtl.Block()
271
+ stage_3_block = pyrtl.Block()
272
+ stage_4_block = pyrtl.Block()
273
+
274
+ # Combinational design
275
+ with set_working_block(combinational_block):
276
+ create_outputs(
277
+ float_multiplier(
278
+ *create_inputs(float_a=bits, float_b=bits), dtype=dtype, fast=fast
279
+ )
280
+ )
281
+
282
+ # Complete pipelined design
283
+ with set_working_block(multiplier_block):
284
+ multiplier = FloatMultiplierPipelined(
285
+ *create_inputs(float_a=bits, float_b=bits), dtype=dtype, fast=fast
286
+ )
287
+ create_outputs(multiplier._result)
288
+
289
+ # Stage 1 & 2: Extract components and calculate sign, exponent sum, mantissa product
290
+ with set_working_block(stage_2_block):
291
+ float_components = extract_float_components(
292
+ *create_inputs(float_a=bits, float_b=bits),
293
+ e_bits=e_bits,
294
+ m_bits=m_bits,
295
+ )
296
+ stage_2_outputs = multiplier_stage_2(
297
+ *float_components,
298
+ m_bits,
299
+ fast,
300
+ )
301
+ create_outputs(*stage_2_outputs)
302
+
303
+ # Stage 3: Leading zero detection and exponent adjustment
304
+ with set_working_block(stage_3_block):
305
+ stage_3_outputs = multiplier_stage_3(
306
+ *create_inputs(exp_sum=e_bits + 1, mant_product=2 * m_bits + 2),
307
+ e_bits=e_bits,
308
+ m_bits=m_bits,
309
+ fast=fast,
310
+ )
311
+ create_outputs(*stage_3_outputs)
312
+
313
+ # Stage 4: Normalization, rounding, and final assembly
314
+ with set_working_block(stage_4_block):
315
+ stage_4_outputs = multiplier_stage_4(
316
+ *create_inputs(
317
+ unbiased_exp=e_bits,
318
+ leading_zeros=e_bits,
319
+ mantissa_product=2 * m_bits + 2,
320
+ ),
321
+ m_bits=m_bits,
322
+ e_bits=e_bits,
323
+ fast=fast,
324
+ )
325
+ create_outputs(*stage_4_outputs)
326
+
327
+ # Return all the generated blocks for analysis
328
+ faststr = "_fast" if fast else ""
329
+ return {
330
+ f"multiplier_combinational{faststr}": combinational_block,
331
+ f"multiplier{faststr}": multiplier_block,
332
+ f"multiplier_stage_2{faststr}": stage_2_block,
333
+ f"multiplier_stage_3{faststr}": stage_3_block,
334
+ f"multiplier_stage_4{faststr}": stage_4_block,
335
+ }
336
+
337
+
338
+ def create_lmul_blocks(dtype: Type[BaseFloat]) -> dict[str, Block]:
339
+ bits = dtype.bitwidth()
340
+
341
+ combinational_block = pyrtl.Block()
342
+ combinational_fast_block = pyrtl.Block()
343
+ pipelined_block = pyrtl.Block()
344
+ pipelined_fast_block = pyrtl.Block()
345
+
346
+ # Combinational design (simple)
347
+ with set_working_block(combinational_block):
348
+ create_outputs(
349
+ lmul_simple(*create_inputs(float_a=bits, float_b=bits), dtype=dtype)
350
+ )
351
+
352
+ # Combinational design (fast)
353
+ with set_working_block(combinational_fast_block):
354
+ create_outputs(
355
+ lmul_fast(*create_inputs(float_a=bits, float_b=bits), dtype=dtype)
356
+ )
357
+
358
+ # Pipelined design (simple)
359
+ with set_working_block(pipelined_block):
360
+ mult = LmulPipelined(
361
+ *create_inputs(float_a=bits, float_b=bits), dtype=dtype, fast=False
362
+ )
363
+ create_outputs(mult.output_reg)
364
+
365
+ # Pipelined design (fast)
366
+ with set_working_block(pipelined_fast_block):
367
+ mult = LmulPipelined(
368
+ *create_inputs(float_a=bits, float_b=bits), dtype=dtype, fast=True
369
+ )
370
+ create_outputs(mult.output_reg)
371
+
372
+ # Return all the generated blocks for analysis
373
+ return {
374
+ "lmul_combinational_simple": combinational_block,
375
+ "lmul_combinational_fast": combinational_fast_block,
376
+ "lmul_pipelined_simple": pipelined_block,
377
+ "lmul_pipelined_fast": pipelined_fast_block,
378
+ }
379
+
380
+
381
+ def connect_pe_io(pe: ProcessingElement):
382
+ # Connect the inputs and outputs of the processing element
383
+ w_bits, a_bits = pe.weight_type.bitwidth(), pe.data_type.bitwidth()
384
+ w_in, d_in, acc_in = create_inputs(
385
+ weight_in=w_bits, data_in=a_bits, accum_in=a_bits
386
+ )
387
+ pe.connect_weight(w_in)
388
+ pe.connect_data(d_in)
389
+ pe.connect_accum(acc_in)
390
+ pe.connect_control_signals(
391
+ *create_inputs(weight_en=1, data_en=1, mul_en=1, adder_en=1)
392
+ )
393
+ create_outputs(*pe.outputs.__dict__.values())
394
+
395
+
396
+ def create_pe_blocks(
397
+ dtypes: tuple[Type[BaseFloat], Type[BaseFloat]],
398
+ ) -> dict[str, Block]:
399
+ """Create a processing element for each pair of dtypes."""
400
+
401
+ weight_dtype, act_dtype = dtypes
402
+
403
+ # Defining blocks to encapsulate hardware
404
+
405
+ combinational_block = Block()
406
+ simple_pipeline_block = Block()
407
+ simple_pipeline_fast_block = Block()
408
+ full_pipeline_block = Block()
409
+ full_pipeline_fast_block = Block()
410
+
411
+ combinational_lmul_block = Block()
412
+ simple_pipeline_lmul_block = Block()
413
+ simple_pipeline_fast_lmul_block = Block()
414
+ full_pipeline_lmul_block = Block()
415
+ full_pipeline_fast_lmul_block = Block()
416
+
417
+ # Standard IEEE multiplier versions
418
+
419
+ with set_working_block(combinational_block):
420
+ pe = ProcessingElement(
421
+ data_type=act_dtype,
422
+ weight_type=weight_dtype,
423
+ accum_type=act_dtype,
424
+ multiplier=float_multiplier,
425
+ adder=float_adder,
426
+ pipeline_mult=False,
427
+ )
428
+ connect_pe_io(pe)
429
+
430
+ with set_working_block(simple_pipeline_block):
431
+ pe = ProcessingElement(
432
+ data_type=act_dtype,
433
+ weight_type=weight_dtype,
434
+ accum_type=act_dtype,
435
+ multiplier=float_multiplier,
436
+ adder=float_adder,
437
+ pipeline_mult=True,
438
+ )
439
+ connect_pe_io(pe)
440
+
441
+ with set_working_block(simple_pipeline_fast_block):
442
+ pe = ProcessingElement(
443
+ data_type=act_dtype,
444
+ weight_type=weight_dtype,
445
+ accum_type=act_dtype,
446
+ multiplier=float_multiplier_fast_unstable,
447
+ adder=float_adder_fast_unstable,
448
+ pipeline_mult=True,
449
+ )
450
+ connect_pe_io(pe)
451
+
452
+ with set_working_block(full_pipeline_block):
453
+ pe = ProcessingElement(
454
+ data_type=act_dtype,
455
+ weight_type=weight_dtype,
456
+ accum_type=act_dtype,
457
+ multiplier=float_multiplier_pipelined,
458
+ adder=float_adder_pipelined,
459
+ pipeline_mult=True,
460
+ )
461
+ connect_pe_io(pe)
462
+
463
+ with set_working_block(full_pipeline_fast_block):
464
+ pe = ProcessingElement(
465
+ data_type=act_dtype,
466
+ weight_type=weight_dtype,
467
+ accum_type=act_dtype,
468
+ multiplier=float_multiplier_pipelined_fast_unstable,
469
+ adder=float_adder_pipelined_fast_unstable,
470
+ pipeline_mult=True,
471
+ )
472
+ connect_pe_io(pe)
473
+
474
+ # L-mul versions
475
+
476
+ with set_working_block(combinational_lmul_block):
477
+ pe = ProcessingElement(
478
+ data_type=act_dtype,
479
+ weight_type=weight_dtype,
480
+ accum_type=act_dtype,
481
+ multiplier=lmul_simple,
482
+ adder=float_adder,
483
+ pipeline_mult=False,
484
+ )
485
+ connect_pe_io(pe)
486
+
487
+ with set_working_block(simple_pipeline_lmul_block):
488
+ pe = ProcessingElement(
489
+ data_type=act_dtype,
490
+ weight_type=weight_dtype,
491
+ accum_type=act_dtype,
492
+ multiplier=lmul_simple,
493
+ adder=float_adder,
494
+ pipeline_mult=True,
495
+ )
496
+ connect_pe_io(pe)
497
+
498
+ with set_working_block(simple_pipeline_fast_lmul_block):
499
+ pe = ProcessingElement(
500
+ data_type=act_dtype,
501
+ weight_type=weight_dtype,
502
+ accum_type=act_dtype,
503
+ multiplier=lmul_fast,
504
+ adder=float_adder_fast_unstable,
505
+ pipeline_mult=True,
506
+ )
507
+ connect_pe_io(pe)
508
+
509
+ with set_working_block(full_pipeline_lmul_block):
510
+ pe = ProcessingElement(
511
+ data_type=act_dtype,
512
+ weight_type=weight_dtype,
513
+ accum_type=act_dtype,
514
+ multiplier=lmul_pipelined,
515
+ adder=float_adder_pipelined,
516
+ pipeline_mult=True,
517
+ )
518
+ connect_pe_io(pe)
519
+
520
+ with set_working_block(full_pipeline_fast_lmul_block):
521
+ pe = ProcessingElement(
522
+ data_type=act_dtype,
523
+ weight_type=weight_dtype,
524
+ accum_type=act_dtype,
525
+ multiplier=lmul_pipelined_fast,
526
+ adder=float_adder_pipelined_fast_unstable,
527
+ pipeline_mult=True,
528
+ )
529
+ connect_pe_io(pe)
530
+
531
+ return {
532
+ "pe_combinational": combinational_block,
533
+ "pe_standard": simple_pipeline_block,
534
+ "pe_fast": simple_pipeline_fast_block,
535
+ "pe_pipelined": full_pipeline_block,
536
+ "pe_fast_pipelined": full_pipeline_fast_block,
537
+ "pe_combinational_lmul": combinational_lmul_block,
538
+ "pe_standard_lmul": simple_pipeline_lmul_block,
539
+ "pe_fast_lmul": simple_pipeline_fast_lmul_block,
540
+ "pe_pipelined_lmul": full_pipeline_lmul_block,
541
+ "pe_fast_pipelined_lmul": full_pipeline_fast_lmul_block,
542
+ }
543
+
544
+
545
+ def create_accelerator_blocks(
546
+ dtypes: tuple[Type[BaseFloat], Type[BaseFloat]],
547
+ array_size: int = 4,
548
+ addr_bits: int = 12,
549
+ ) -> dict[str, Block]:
550
+ """
551
+ Create accelerator blocks for all valid configurations based on the given inputs.
552
+
553
+ Args:
554
+ dtypes: Tuple of (weight_type, activation_type) data types
555
+ array_size: Size of the systolic array (N x N)
556
+ addr_bits: Bit width for accumulator address (uses default if None)
557
+
558
+ Returns:
559
+ Dictionary mapping configuration names to PyRTL blocks
560
+ """
561
+ weight_type, activation_type = dtypes
562
+
563
+ # Define all valid configurations to test
564
+ pipeline_options = [None, "low", "high"]
565
+ lmul_options = [False, True]
566
+ fast_options = [False, True]
567
+
568
+ # Create configs and blocks
569
+ blocks = {}
570
+ for pipeline, lmul, fast in product(pipeline_options, lmul_options, fast_options):
571
+ if pipeline is None and fast is True:
572
+ continue
573
+
574
+ # Create the configuration
575
+ config = AcceleratorAnalysisConfig(
576
+ array_size=array_size,
577
+ activation_type=activation_type,
578
+ weight_type=weight_type,
579
+ lmul=lmul,
580
+ accum_addr_width=addr_bits,
581
+ pipeline_level=pipeline,
582
+ use_fast_internals=fast,
583
+ )
584
+
585
+ block = pyrtl.Block()
586
+ with set_working_block(block):
587
+ AcceleratorTopLevel(config)
588
+
589
+ blocks[config.name] = block
590
+
591
+ return blocks
592
+
593
+
594
+ ################################################################
595
+
596
+ # if __name__ == "__main__":
597
+
598
+ # OUTPUT_DIR = Path("verilog")
599
+ # POSTSYNTH_DIR = OUTPUT_DIR / "pyrtl_synth"
600
+
601
+ # EXPORT_PRE_SYNTH = False
602
+ # EXPORT_POST_SYNTH = True
603
+ # RUN_ANALYSIS = True
604
+ # ANALYSIS_RESULT_DIR = Path("results")
605
+
606
+ # array_size = 8
607
+ # addr_bits = 12
608
+
609
+ # dtype_list = [Float8, BF16, Float32]
610
+
611
+ # dtype_names = {Float8: "fp8", BF16: "bf16", Float32: "fp32"}
612
+
613
+ # weight_act_dtypes = [
614
+ # (Float8, Float8),
615
+ # (Float8, BF16),
616
+ # (Float8, Float32),
617
+ # (BF16, BF16),
618
+ # (BF16, Float32),
619
+ # (Float32, Float32),
620
+ # ]
621
+
622
+ # # Hardware building blocks
623
+ # basic_component_analysis = []
624
+
625
+ # for dtype in dtype_list:
626
+ # block_dicts = [
627
+ # ("adder", create_adder_blocks(dtype)),
628
+ # ("multiplier", create_multiplier_blocks(dtype, fast=False)),
629
+ # ("multiplier", create_multiplier_blocks(dtype, fast=True)),
630
+ # ("lmul", create_lmul_blocks(dtype)),
631
+ # ]
632
+ # for component_name, block_dict in block_dicts:
633
+ # for name, block in block_dict.items():
634
+ # output_path = Path(component_name, dtype_names[dtype], f"{name}.v")
635
+ # if EXPORT_PRE_SYNTH:
636
+ # export_to_verilog(block, OUTPUT_DIR / output_path)
637
+ # if RUN_ANALYSIS:
638
+ # analysis_result = analyze(block, name=name)
639
+ # analysis_result.dtype = dtype_names[dtype]
640
+ # analysis_result.component = component_name
641
+ # basic_component_analysis.append(analysis_result.__dict__)
642
+ # if EXPORT_POST_SYNTH:
643
+ # export_to_verilog(block, POSTSYNTH_DIR / output_path)
644
+
645
+ # # More complex hardware
646
+ # pe_analysis = []
647
+ # accelerator_analysis = []
648
+
649
+ # for weight_dtype, act_dtype in weight_act_dtypes:
650
+ # folder_name = f"w{weight_dtype.bitwidth()}a{act_dtype.bitwidth()}"
651
+
652
+ # pe_blocks = create_pe_blocks((weight_dtype, act_dtype))
653
+ # for name, block in pe_blocks.items():
654
+ # pe_output_path = Path("pe", folder_name, f"{name}.v")
655
+ # if EXPORT_PRE_SYNTH:
656
+ # export_to_verilog(block, OUTPUT_DIR / pe_output_path)
657
+ # if RUN_ANALYSIS:
658
+ # analysis_result = analyze(block, name=name)
659
+ # analysis_result.weights = dtype_names[weight_dtype]
660
+ # analysis_result.activations = dtype_names[act_dtype]
661
+ # analysis_result.component = "pe"
662
+ # pe_analysis.append(analysis_result.__dict__)
663
+ # if EXPORT_POST_SYNTH:
664
+ # export_to_verilog(block, POSTSYNTH_DIR / pe_output_path)
665
+
666
+ # accelerator_blocks = create_accelerator_blocks(
667
+ # (weight_dtype, act_dtype), array_size, addr_bits
668
+ # )
669
+ # for name, block in accelerator_blocks.items():
670
+ # accelerator_output_path = Path("accelerator", folder_name, f"{name}.v")
671
+ # if EXPORT_PRE_SYNTH:
672
+ # export_to_verilog(block, OUTPUT_DIR / accelerator_output_path)
673
+ # if RUN_ANALYSIS:
674
+ # analysis_result = analyze(block, name=name)
675
+ # analysis_result.weights = dtype_names[weight_dtype]
676
+ # analysis_result.activations = dtype_names[act_dtype]
677
+ # analysis_result.component = "accelerator"
678
+ # accelerator_analysis.append(analysis_result.__dict__)
679
+ # if EXPORT_POST_SYNTH:
680
+ # export_to_verilog(block, POSTSYNTH_DIR / accelerator_output_path)
681
+
682
+ # if RUN_ANALYSIS:
683
+ # DataFrame(basic_component_analysis).to_csv(
684
+ # ANALYSIS_RESULT_DIR / "component_analysis.csv", index=False
685
+ # )
686
+ # DataFrame(pe_analysis).to_csv(
687
+ # ANALYSIS_RESULT_DIR / "pe_analysis.csv", index=False
688
+ # )
689
+ # DataFrame(accelerator_analysis).to_csv(
690
+ # ANALYSIS_RESULT_DIR / "accelerator_analysis.csv", index=False
691
+ # )
692
+
693
+
694
+ import multiprocessing as mp
695
+ from pathlib import Path
696
+ import os
697
+ import csv
698
+ import time
699
+ from functools import partial
700
+ from pandas import DataFrame
701
+ import json
702
+ import traceback
703
+
704
+
705
+ def process_block(
706
+ block,
707
+ name,
708
+ output_dir,
709
+ postsynth_dir,
710
+ export_pre_synth,
711
+ export_post_synth,
712
+ run_analysis,
713
+ analysis_result_dir,
714
+ component_name=None,
715
+ dtype=None,
716
+ weight_dtype=None,
717
+ act_dtype=None,
718
+ dtype_names=None,
719
+ output_path=None,
720
+ ):
721
+ """Process a single block with optional export and analysis"""
722
+ result = None
723
+ try:
724
+ if export_pre_synth and output_path:
725
+ os.makedirs((output_dir / output_path).parent, exist_ok=True)
726
+ export_to_verilog(block, output_dir / output_path)
727
+
728
+ if run_analysis:
729
+ analysis_result = analyze(block, name=name)
730
+
731
+ # Set appropriate attributes based on the component type
732
+ if component_name and dtype:
733
+ analysis_result.dtype = dtype_names[dtype]
734
+ analysis_result.component = component_name
735
+ result_type = "component"
736
+ elif weight_dtype and act_dtype:
737
+ analysis_result.weights = dtype_names[weight_dtype]
738
+ analysis_result.activations = dtype_names[act_dtype]
739
+ analysis_result.component = component_name
740
+ result_type = "pe" if component_name == "pe" else "accelerator"
741
+
742
+ result = (result_type, analysis_result.__dict__)
743
+
744
+ if export_post_synth and output_path:
745
+ os.makedirs((postsynth_dir / output_path).parent, exist_ok=True)
746
+ export_to_verilog(block, postsynth_dir / output_path)
747
+
748
+ return result
749
+ except Exception as e:
750
+ error_msg = f"Error processing {name}: {str(e)}\n{traceback.format_exc()}"
751
+ print(error_msg)
752
+ with open(analysis_result_dir / "errors.log", "a") as f:
753
+ f.write(f"{error_msg}\n{'='*80}\n")
754
+ return None
755
+
756
+
757
+ def save_result(result, analysis_result_dir, result_files):
758
+ """Save a single result to the appropriate CSV file"""
759
+ if not result:
760
+ return
761
+
762
+ result_type, data = result
763
+
764
+ # Get the appropriate file path and headers
765
+ if result_type == "component":
766
+ file_path = analysis_result_dir / "component_analysis.csv"
767
+ elif result_type == "pe":
768
+ file_path = analysis_result_dir / "pe_analysis.csv"
769
+ elif result_type == "accelerator":
770
+ file_path = analysis_result_dir / "accelerator_analysis.csv"
771
+
772
+ # Create directory if it doesn't exist
773
+ os.makedirs(file_path.parent, exist_ok=True)
774
+
775
+ # Check if file exists to determine if we need to write headers
776
+ file_exists = file_path.exists()
777
+
778
+ # Get headers from the data
779
+ headers = list(data.keys())
780
+
781
+ # Open file in append mode
782
+ with open(file_path, "a", newline="") as f:
783
+ writer = csv.DictWriter(f, fieldnames=headers)
784
+
785
+ # Write headers if file doesn't exist
786
+ if not file_exists:
787
+ writer.writeheader()
788
+
789
+ # Write the data row
790
+ writer.writerow(data)
791
+
792
+ # Track that we've written to this file
793
+ result_files.add(file_path)
794
+
795
+
796
+ def result_callback(result, analysis_result_dir, result_files):
797
+ """Callback function for when a process completes"""
798
+ if result:
799
+ save_result(result, analysis_result_dir, result_files)
800
+
801
+
802
+ if __name__ == "__main__":
803
+ # Create a multiprocessing pool with as many processes as CPU cores
804
+ pool = mp.Pool(processes=mp.cpu_count())
805
+
806
+ # Set to track which result files we've written to
807
+ result_files = set()
808
+
809
+ OUTPUT_DIR = Path("verilog")
810
+ POSTSYNTH_DIR = OUTPUT_DIR / "pyrtl_synth"
811
+
812
+ EXPORT_PRE_SYNTH = False
813
+ EXPORT_POST_SYNTH = True
814
+ RUN_ANALYSIS = True
815
+ ANALYSIS_RESULT_DIR = Path("results")
816
+
817
+ # Create output directories
818
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
819
+ os.makedirs(POSTSYNTH_DIR, exist_ok=True)
820
+ os.makedirs(ANALYSIS_RESULT_DIR, exist_ok=True)
821
+
822
+ array_size = 8
823
+ addr_bits = 12
824
+
825
+ dtype_list = [Float8, BF16, Float32]
826
+
827
+ dtype_names = {Float8: "fp8", BF16: "bf16", Float32: "fp32"}
828
+
829
+ weight_act_dtypes = [
830
+ (Float8, Float8),
831
+ (Float8, BF16),
832
+ (Float8, Float32),
833
+ (BF16, BF16),
834
+ (BF16, Float32),
835
+ (Float32, Float32),
836
+ ]
837
+
838
+ # Create a partial function with common arguments
839
+ process_block_partial = partial(
840
+ process_block,
841
+ output_dir=OUTPUT_DIR,
842
+ postsynth_dir=POSTSYNTH_DIR,
843
+ export_pre_synth=EXPORT_PRE_SYNTH,
844
+ export_post_synth=EXPORT_POST_SYNTH,
845
+ run_analysis=RUN_ANALYSIS,
846
+ analysis_result_dir=ANALYSIS_RESULT_DIR,
847
+ dtype_names=dtype_names,
848
+ )
849
+
850
+ # Create a callback function with common arguments
851
+ callback = partial(
852
+ result_callback,
853
+ analysis_result_dir=ANALYSIS_RESULT_DIR,
854
+ result_files=result_files,
855
+ )
856
+
857
+ # Track all submitted tasks
858
+ tasks = []
859
+
860
+ # Hardware building blocks
861
+ for dtype in dtype_list:
862
+ block_dicts = [
863
+ ("adder", create_adder_blocks(dtype)),
864
+ ("multiplier", create_multiplier_blocks(dtype, fast=False)),
865
+ ("multiplier", create_multiplier_blocks(dtype, fast=True)),
866
+ ("lmul", create_lmul_blocks(dtype)),
867
+ ]
868
+
869
+ for component_name, block_dict in block_dicts:
870
+ for name, block in block_dict.items():
871
+ output_path = Path(component_name, dtype_names[dtype], f"{name}.v")
872
+
873
+ # Submit task to process pool
874
+ task = pool.apply_async(
875
+ process_block_partial,
876
+ args=(block, name),
877
+ kwds={
878
+ "component_name": component_name,
879
+ "dtype": dtype,
880
+ "output_path": output_path,
881
+ },
882
+ callback=callback,
883
+ )
884
+ tasks.append(task)
885
+
886
+ # More complex hardware
887
+ for weight_dtype, act_dtype in weight_act_dtypes:
888
+ folder_name = f"w{weight_dtype.bitwidth()}a{act_dtype.bitwidth()}"
889
+
890
+ # Process PE blocks
891
+ pe_blocks = create_pe_blocks((weight_dtype, act_dtype))
892
+ for name, block in pe_blocks.items():
893
+ pe_output_path = Path("pe", folder_name, f"{name}.v")
894
+
895
+ task = pool.apply_async(
896
+ process_block_partial,
897
+ args=(block, name),
898
+ kwds={
899
+ "component_name": "pe",
900
+ "weight_dtype": weight_dtype,
901
+ "act_dtype": act_dtype,
902
+ "output_path": pe_output_path,
903
+ },
904
+ callback=callback,
905
+ )
906
+ tasks.append(task)
907
+
908
+ # Process accelerator blocks
909
+ accelerator_blocks = create_accelerator_blocks(
910
+ (weight_dtype, act_dtype), array_size, addr_bits
911
+ )
912
+ for name, block in accelerator_blocks.items():
913
+ accelerator_output_path = Path("accelerator", folder_name, f"{name}.v")
914
+
915
+ task = pool.apply_async(
916
+ process_block_partial,
917
+ args=(block, name),
918
+ kwds={
919
+ "component_name": "accelerator",
920
+ "weight_dtype": weight_dtype,
921
+ "act_dtype": act_dtype,
922
+ "output_path": accelerator_output_path,
923
+ },
924
+ callback=callback,
925
+ )
926
+ tasks.append(task)
927
+
928
+ # Wait for all tasks to complete
929
+ try:
930
+ # Monitor progress
931
+ total_tasks = len(tasks)
932
+ completed = 0
933
+ print(f"Processing {total_tasks} tasks using {mp.cpu_count()} processes")
934
+
935
+ while completed < total_tasks:
936
+ new_completed = sum(1 for task in tasks if task.ready())
937
+ if new_completed > completed:
938
+ completed = new_completed
939
+ print(
940
+ f"Progress: {completed}/{total_tasks} tasks completed ({completed/total_tasks*100:.1f}%)"
941
+ )
942
+ time.sleep(1)
943
+
944
+ # Make sure all tasks are properly completed
945
+ for task in tasks:
946
+ task.get() # This will raise any exceptions that occurred in the task
947
+
948
+ except KeyboardInterrupt:
949
+ print("Process interrupted by user. Partial results have been saved.")
950
+ except Exception as e:
951
+ print(f"An error occurred: {str(e)}")
952
+ print("Partial results have been saved.")
953
+ finally:
954
+ # Close the pool
955
+ pool.close()
956
+ pool.join()
957
+
958
+ print(f"Results saved to: {', '.join(str(f) for f in result_files)}")
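generate.py's analyze() synthesizes and optimizes a PyRTL block, then reports critical-path delay, maximum frequency, and estimated logic/memory area. A minimal, hypothetical standalone use on a trivial circuit (an 8-bit integer adder rather than one of the floating-point blocks above), runnable inside this repository:

import pyrtl

from hardware_accelerators.analysis.generate import analyze

pyrtl.reset_working_block()
a = pyrtl.Input(8, "a")
b = pyrtl.Input(8, "b")
total = pyrtl.Output(9, "total")
total <<= a + b  # simple combinational adder just to exercise the analysis path

# Synthesize + optimize, then print timing and area estimates via RTLAnalysis.
result = analyze(pyrtl.working_block(), synth=True, opt=True, name="adder8")
print(result)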
hardware_accelerators/analysis/hardware_stats.py ADDED
@@ -0,0 +1,458 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from typing import Literal, Dict, Tuple, List, Any, Optional
4
+
5
+ # Mapping from UI options to dataframe values
6
+ DTYPE_MAP = {"float8": "fp8", "bfloat16": "bf16", "float32": "fp32"}
7
+
8
+ SPEED_MAP = {"Fast": True, "Efficient": False}
9
+
10
+ PIPELINE_MAP = {
11
+ "None": "combinational",
12
+ "Low": "combinational",
13
+ "Full": "pipelined",
14
+ "High": "pipelined",
15
+ }
16
+
17
+
18
+ def filter_components(
19
+ df: pd.DataFrame, operation: str, dtype: str, is_fast: bool, architecture: str
20
+ ) -> pd.DataFrame:
21
+ """
22
+ Filter the dataframe to get components matching the specified criteria.
23
+
24
+ Args:
25
+ df: DataFrame containing component data
26
+ operation: Type of operation ('multiplier', 'lmul', 'adder')
27
+ dtype: Data type ('fp8', 'bf16', 'fp32')
28
+ is_fast: Whether to use fast components
29
+ architecture: Architecture type ('combinational', 'pipelined')
30
+
31
+ Returns:
32
+ Filtered DataFrame
33
+ """
34
+ filtered_df = df[
35
+ (df["operation"] == operation)
36
+ & (df["dtype"] == dtype)
37
+ & (df["is_fast"] == is_fast)
38
+ & (df["architecture"] == architecture)
39
+ ]
40
+
41
+ if filtered_df.empty:
42
+ # If no exact match, try without the is_fast constraint
43
+ filtered_df = df[
44
+ (df["operation"] == operation)
45
+ & (df["dtype"] == dtype)
46
+ & (df["architecture"] == architecture)
47
+ ]
48
+
49
+ if filtered_df.empty:
50
+ # If still no match, try without architecture constraint
51
+ filtered_df = df[(df["operation"] == operation) & (df["dtype"] == dtype)]
52
+
53
+ return filtered_df
54
+
55
+
56
+ def get_pe_components(
57
+ df: pd.DataFrame, mult_type: str, dtype: str, is_fast: bool, architecture: str
58
+ ) -> Tuple[pd.Series, pd.Series]:
59
+ """
60
+ Get the multiplier/lmul and adder components for a PE.
61
+
62
+ Args:
63
+ df: DataFrame containing component data
64
+ mult_type: Type of multiplier ('multiplier' or 'lmul')
65
+ dtype: Data type ('fp8', 'bf16', 'fp32')
66
+ is_fast: Whether to use fast components
67
+ architecture: Architecture type ('combinational', 'pipelined')
68
+
69
+ Returns:
70
+ Tuple of (multiplier, adder) Series
71
+ """
72
+ mult_df = filter_components(df, mult_type, dtype, is_fast, architecture)
73
+ adder_df = filter_components(df, "adder", dtype, is_fast, architecture)
74
+
75
+ if mult_df.empty or adder_df.empty:
76
+ raise ValueError(
77
+ f"Could not find components for {mult_type}, {dtype}, {is_fast}, {architecture}"
78
+ )
79
+
80
+ # Get the first matching component
81
+ multiplier = mult_df.iloc[0]
82
+ adder = adder_df.iloc[0]
83
+
84
+ return multiplier, adder
85
+
86
+
87
+ def calculate_pe_metrics(multiplier: pd.Series, adder: pd.Series) -> Dict[str, float]:
88
+ """
89
+ Calculate metrics for a single PE (Processing Element).
90
+
91
+ Args:
92
+ multiplier: Series containing multiplier component data
93
+ adder: Series containing adder component data
94
+
95
+ Returns:
96
+ Dictionary of PE metrics
97
+ """
98
+ # Area and power are additive
99
+ area = multiplier["area"] + adder["area"]
100
+ power = multiplier["power"] + adder["power"]
101
+
102
+ # Critical path depends on architecture
103
+ if (
104
+ multiplier["architecture"] == "combinational"
105
+ and adder["architecture"] == "combinational"
106
+ ):
107
+ delay = max(multiplier["max_arrival_time"], adder["max_arrival_time"])
108
+ pipeline_stages = 1
109
+ else:
110
+ # For pipelined designs, we assume the critical path is the slowest stage
111
+ delay = max(multiplier["max_arrival_time"], adder["max_arrival_time"])
112
+ # Estimate pipeline stages
113
+ if multiplier["architecture"] == "pipelined":
114
+ mult_stages = 4 # Assumption for pipelined multiplier
115
+ else:
116
+ mult_stages = 1
117
+
118
+ if adder["architecture"] == "pipelined":
119
+ add_stages = 5 # Assumption for pipelined adder
120
+ else:
121
+ add_stages = 1
122
+
123
+ pipeline_stages = mult_stages + add_stages - 1 # -1 because they share a stage
124
+
125
+ # Calculate performance metrics
126
+ clock_freq_ghz = 1.0 / delay # GHz, assuming delay is in ns
127
+ ops_per_cycle = 2 # 1 multiply + 1 add = 2 FLOPs per cycle
128
+ tflops = clock_freq_ghz * ops_per_cycle / 1000 # TFLOPS for a single PE
129
+
130
+ # Efficiency metrics
131
+ tflops_per_watt = tflops / power
132
+ tflops_per_mm2 = tflops / (area * 1e-6) # Convert area to mm²
133
+ power_density = power / (area * 1e-6) # W/mm²
134
+
135
+ return {
136
+ "area_um2": area,
137
+ "area_mm2": area * 1e-6,
138
+ "power_w": power,
139
+ "delay_ns": delay,
140
+ "clock_freq_ghz": clock_freq_ghz,
141
+ "pipeline_stages": pipeline_stages,
142
+ "tflops": tflops,
143
+ "tflops_per_watt": tflops_per_watt,
144
+ "tflops_per_mm2": tflops_per_mm2,
145
+ "power_density": power_density,
146
+ "energy_per_op_pj": (power / (clock_freq_ghz * ops_per_cycle * 1e9))
147
+ * 1e12, # pJ per operation
148
+ }
149
+
150
+
151
+ def calculate_array_metrics(
152
+ pe_metrics: Dict[str, float], array_size: int, num_cores: int
153
+ ) -> Dict[str, float]:
154
+ """
155
+ Calculate metrics for a systolic array and the entire accelerator.
156
+
157
+ Args:
158
+ pe_metrics: Dictionary of PE metrics
159
+ array_size: Size of the systolic array (NxN)
160
+ num_cores: Number of accelerator cores
161
+
162
+ Returns:
163
+ Dictionary of array and accelerator metrics
164
+ """
165
+ # Number of PEs per array and total
166
+ pes_per_array = array_size * array_size
167
+ total_pes = pes_per_array * num_cores
168
+
169
+ # Scale metrics for a single array
170
+ array_area_mm2 = pe_metrics["area_mm2"] * pes_per_array
171
+ array_power_w = pe_metrics["power_w"] * pes_per_array
172
+
173
+ # Scale metrics for the entire accelerator
174
+ total_area_mm2 = array_area_mm2 * num_cores
175
+ total_power_w = array_power_w * num_cores
176
+
177
+ # Performance scales with the number of PEs
178
+ array_tflops = pe_metrics["tflops"] * pes_per_array
179
+ total_tflops = array_tflops * num_cores
180
+
181
+ # Efficiency metrics
182
+ total_tflops_per_watt = total_tflops / total_power_w
183
+ total_tflops_per_mm2 = total_tflops / total_area_mm2
184
+
185
+ # Latency calculation
186
+ # For an NxN array, data takes 2N-1 cycles to flow through
187
+ # Plus pipeline_stages-1 cycles for the pipeline to fill
188
+ pipeline_latency_cycles = pe_metrics["pipeline_stages"] - 1
189
+ array_latency_cycles = 2 * array_size - 1
190
+ total_latency_cycles = array_latency_cycles + pipeline_latency_cycles
191
+
192
+ # Time per cycle based on clock frequency
193
+ cycle_time_ns = 1.0 / pe_metrics["clock_freq_ghz"]
194
+
195
+ # Total latency in ns
196
+ total_latency_ns = total_latency_cycles * cycle_time_ns
197
+
198
+ # Throughput after pipeline is filled (ops per second)
199
+ throughput_ops_per_second = (
200
+ pe_metrics["clock_freq_ghz"] * 1e9 * pes_per_array * 2
201
+ ) # 2 ops per PE per cycle
202
+ total_throughput_ops_per_second = throughput_ops_per_second * num_cores
203
+
204
+ # Energy per matrix multiplication
205
+ # Assuming an NxN matrix multiply requires N³ operations
206
+ ops_per_matmul = array_size**3
207
+ energy_per_matmul_nj = (
208
+ (array_power_w / throughput_ops_per_second) * ops_per_matmul * 1e9
209
+ ) # nJ
210
+
211
+ # Inference metrics (assuming a simple MLP with 3 layers)
212
+ # Each layer requires a matrix multiplication
213
+ num_layers = 3
214
+ inference_ops = ops_per_matmul * num_layers
215
+ inference_latency_ns = (inference_ops / throughput_ops_per_second) * 1e9
216
+ inference_energy_uj = (
217
+ (total_power_w / total_throughput_ops_per_second) * inference_ops * 1e6
218
+ ) # uJ
219
+
220
+ return {
221
+ "array_size": array_size,
222
+ "num_cores": num_cores,
223
+ "pes_per_array": pes_per_array,
224
+ "total_pes": total_pes,
225
+ "clock_freq_ghz": pe_metrics["clock_freq_ghz"],
226
+ "array_area_mm2": array_area_mm2,
227
+ "total_area_mm2": total_area_mm2,
228
+ "array_power_w": array_power_w,
229
+ "total_power_w": total_power_w,
230
+ "array_tflops": array_tflops,
231
+ "total_tflops": total_tflops,
232
+ "tflops_per_watt": total_tflops_per_watt,
233
+ "tflops_per_mm2": total_tflops_per_mm2,
234
+ "power_density_w_mm2": total_power_w / total_area_mm2,
235
+ "total_latency_cycles": total_latency_cycles,
236
+ "total_latency_ns": total_latency_ns,
237
+ "throughput_gops": total_throughput_ops_per_second / 1e9, # GOPS
238
+ "energy_per_matmul_nj": energy_per_matmul_nj,
239
+ "inference_latency_ns": inference_latency_ns,
240
+ "inference_latency_us": inference_latency_ns / 1e3, # us
241
+ "inference_energy_uj": inference_energy_uj,
242
+ "inferences_per_second": 1e9 / inference_latency_ns,
243
+ "inferences_per_watt": (1e9 / inference_latency_ns) / total_power_w,
244
+ }
245
+
246
+
247
+ def format_metrics_for_display(metrics: Dict[str, float]) -> Dict[str, str]:
248
+ """
249
+ Format metrics for display in the Gradio UI.
250
+
251
+ Args:
252
+ metrics: Dictionary of metrics
253
+
254
+ Returns:
255
+ Dictionary of formatted metrics
256
+ """
257
+ formatted = {}
258
+
259
+ # Format area
260
+ formatted["Total Chip Area"] = f"{metrics['total_area_mm2']:.2f} mm²"
261
+
262
+ # Format performance
263
+ formatted["Clock Speed"] = f"{metrics['clock_freq_ghz']:.2f} GHz"
264
+ formatted["Total Performance"] = f"{metrics['total_tflops']:.2f} TFLOPS"
265
+ formatted["Performance per Core"] = f"{metrics['array_tflops']:.2f} TFLOPS"
266
+ formatted["Performance per Watt"] = f"{metrics['tflops_per_watt']:.2f} TFLOPS/W"
267
+ formatted["Performance per Area"] = f"{metrics['tflops_per_mm2']:.2f} TFLOPS/mm²"
268
+
269
+ # Format power
270
+ formatted["Total Power"] = f"{metrics['total_power_w']:.2f} W"
271
+ formatted["Power Density"] = f"{metrics['power_density_w_mm2']:.2f} W/mm²"
272
+
273
+ # Format latency and throughput
274
+ formatted["Matrix Mult Latency"] = f"{metrics['total_latency_ns']:.2f} ns"
275
+ formatted["Inference Latency"] = f"{metrics['inference_latency_us']:.2f} µs"
276
+ formatted["Throughput"] = f"{metrics['throughput_gops']:.2f} GOPS"
277
+
278
+ # Format energy
279
+ formatted["Energy per Matrix Mult"] = f"{metrics['energy_per_matmul_nj']:.2f} nJ"
280
+ # formatted["Inference Energy"] = f"{metrics['inference_energy_uj']:.2f} µJ"
281
+ # formatted["Inferences per Second"] = f"{metrics['inferences_per_second']:.0f}"
282
+ # formatted["Inferences per Watt"] = f"{metrics['inferences_per_watt']:.0f}"
283
+
284
+ return formatted
285
+
286
+
287
+ def calculate_hardware_stats(
288
+ df: pd.DataFrame,
289
+ activation_type: Literal["float8", "bfloat16", "float32"],
290
+ weight_type: Literal["float8", "bfloat16", "float32"],
291
+ systolic_array_size: int,
292
+ num_accelerator_cores: int,
293
+ fast_internals: Literal["Fast", "Efficient"],
294
+ pipeline_level: Literal["None", "Low", "Full"],
295
+ process_node_size: Optional[Literal["7nm", "45nm", "130nm"]] = None,
296
+ ) -> Tuple[Dict[str, str], Dict[str, str]]:
297
+ """
298
+ Calculate hardware statistics for both lmul and standard IEEE multiplier configurations.
299
+
300
+ Args:
301
+ df: DataFrame containing component data
302
+ activation_type: Type of activations
303
+ weight_type: Type of weights
304
+ systolic_array_size: Size of the systolic array
305
+ num_accelerator_cores: Number of accelerator cores
306
+ fast_internals: Whether to use fast or efficient components
307
+ pipeline_level: Level of pipelining
308
+ process_node_size: Process node size (ignored for now)
309
+
310
+ Returns:
311
+ Tuple of (lmul_metrics, ieee_metrics) dictionaries
312
+ """
313
+ # Map UI options to dataframe values
314
+ act_dtype = DTYPE_MAP[activation_type]
315
+ weight_dtype = DTYPE_MAP[weight_type]
316
+ is_fast = SPEED_MAP[fast_internals]
317
+ architecture = PIPELINE_MAP[pipeline_level]
318
+
319
+ # For mixed precision, use the larger precision for the PE. Rank the dtypes
+ # explicitly; comparing the mapped strings ('fp8', 'bf16', 'fp32') with >=
+ # would order them lexicographically, not by precision.
+ precision_rank = {"fp8": 0, "bf16": 1, "fp32": 2}
+ pe_dtype = (
+ act_dtype
+ if precision_rank[act_dtype] >= precision_rank[weight_dtype]
+ else weight_dtype
+ )
325
+
326
+ # Calculate metrics for lmul configuration
327
+ try:
328
+ lmul_mult, lmul_adder = get_pe_components(
329
+ df, "lmul", pe_dtype, is_fast, architecture
330
+ )
331
+ lmul_pe_metrics = calculate_pe_metrics(lmul_mult, lmul_adder)
332
+ lmul_array_metrics = calculate_array_metrics(
333
+ lmul_pe_metrics, systolic_array_size, num_accelerator_cores
334
+ )
335
+ lmul_formatted = format_metrics_for_display(lmul_array_metrics)
336
+ except ValueError as e:
337
+ # If lmul components not found, return error message
338
+ lmul_formatted = {"Error": f"Could not find lmul components: {str(e)}"}
339
+
340
+ # Calculate metrics for standard IEEE multiplier configuration
341
+ try:
342
+ ieee_mult, ieee_adder = get_pe_components(
343
+ df, "multiplier", pe_dtype, is_fast, architecture
344
+ )
345
+ ieee_pe_metrics = calculate_pe_metrics(ieee_mult, ieee_adder)
346
+ ieee_array_metrics = calculate_array_metrics(
347
+ ieee_pe_metrics, systolic_array_size, num_accelerator_cores
348
+ )
349
+ ieee_formatted = format_metrics_for_display(ieee_array_metrics)
350
+ except ValueError as e:
351
+ # If IEEE components not found, return error message
352
+ ieee_formatted = {"Error": f"Could not find IEEE components: {str(e)}"}
353
+
354
+ return lmul_formatted, ieee_formatted
355
+
356
+
357
+ def calculate_comparison_metrics(
358
+ lmul_metrics: Dict[str, str], ieee_metrics: Dict[str, str]
359
+ ) -> Dict[str, str]:
360
+ """
361
+ Calculate comparison metrics between lmul and IEEE configurations.
362
+
363
+ Args:
364
+ lmul_metrics: Dictionary of lmul metrics
365
+ ieee_metrics: Dictionary of IEEE metrics
366
+
367
+ Returns:
368
+ Dictionary of comparison metrics
369
+ """
370
+ comparison = {}
371
+
372
+ # Check if there was an error in either calculation
373
+ if "Error" in lmul_metrics or "Error" in ieee_metrics:
374
+ return {"Error": "Cannot calculate comparison due to missing components"}
375
+
376
+ # Extract numeric values from formatted strings
377
+ def extract_number(s):
378
+ return float(s.split()[0])
379
+
380
+ # Calculate percentage improvements
381
+ try:
382
+ # Area improvement (lower is better)
383
+ lmul_area = extract_number(lmul_metrics["Total Chip Area"])
384
+ ieee_area = extract_number(ieee_metrics["Total Chip Area"])
385
+ area_improvement = (1 - lmul_area / ieee_area) * 100
386
+ comparison["Area Reduction"] = f"{area_improvement:.1f}%"
387
+
388
+ # Performance improvement (higher is better)
389
+ lmul_perf = extract_number(lmul_metrics["Total Performance"])
390
+ ieee_perf = extract_number(ieee_metrics["Total Performance"])
391
+ perf_improvement = (lmul_perf / ieee_perf - 1) * 100
392
+ comparison["Performance Improvement"] = f"{perf_improvement:.1f}%"
393
+
394
+ # Power efficiency improvement (higher is better)
395
+ lmul_eff = extract_number(lmul_metrics["Performance per Watt"])
396
+ ieee_eff = extract_number(ieee_metrics["Performance per Watt"])
397
+ eff_improvement = (lmul_eff / ieee_eff - 1) * 100
398
+ comparison["Efficiency Improvement"] = f"{eff_improvement:.1f}%"
399
+
400
+ # Latency improvement (lower is better)
401
+ lmul_latency = extract_number(lmul_metrics["Inference Latency"])
402
+ ieee_latency = extract_number(ieee_metrics["Inference Latency"])
403
+ latency_improvement = (1 - lmul_latency / ieee_latency) * 100
404
+ comparison["Latency Reduction"] = f"{latency_improvement:.1f}%"
405
+
406
+ # Energy improvement (lower is better). "Inference Energy" is only present
+ # when format_metrics_for_display emits it (currently commented out there),
+ # so guard against a missing key instead of raising.
+ if "Inference Energy" in lmul_metrics and "Inference Energy" in ieee_metrics:
+ lmul_energy = extract_number(lmul_metrics["Inference Energy"])
+ ieee_energy = extract_number(ieee_metrics["Inference Energy"])
+ energy_improvement = (1 - lmul_energy / ieee_energy) * 100
+ comparison["Energy Reduction"] = f"{energy_improvement:.1f}%"
411
+
412
+ except (ValueError, KeyError) as e:
413
+ comparison["Error"] = f"Error calculating comparisons: {str(e)}"
414
+
415
+ return comparison
416
+
417
+
418
+ # Example usage in the Gradio app:
419
+ def update_hardware_stats(
420
+ df: pd.DataFrame,
421
+ activation_type: str,
422
+ weight_type: str,
423
+ systolic_array_size: int,
424
+ num_accelerator_cores: int,
425
+ fast_internals: str,
426
+ pipeline_level: str,
427
+ process_node_size: str,
428
+ ) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]:
429
+ """
430
+ Update hardware statistics for the Gradio app.
431
+
432
+ Args:
433
+ df: DataFrame containing component data
434
+ activation_type: Type of activations
435
+ weight_type: Type of weights
436
+ systolic_array_size: Size of the systolic array
437
+ num_accelerator_cores: Number of accelerator cores
438
+ fast_internals: Whether to use fast or efficient components
439
+ pipeline_level: Level of pipelining
440
+ process_node_size: Process node size
441
+
442
+ Returns:
443
+ Tuple of (lmul_metrics, ieee_metrics, comparison_metrics) dictionaries
444
+ """
445
+ lmul_metrics, ieee_metrics = calculate_hardware_stats(
446
+ df,
447
+ activation_type,
448
+ weight_type,
449
+ systolic_array_size,
450
+ num_accelerator_cores,
451
+ fast_internals,
452
+ pipeline_level,
453
+ process_node_size,
454
+ )
455
+
456
+ comparison_metrics = calculate_comparison_metrics(lmul_metrics, ieee_metrics)
457
+
458
+ return lmul_metrics, ieee_metrics, comparison_metrics
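A minimal usage sketch of the helpers above, for illustration only. It assumes a component CSV with the columns referenced by filter_components and calculate_pe_metrics (area, power, max_arrival_time, architecture); the path mirrors the one read by the Gradio app.

import pandas as pd

df = pd.read_csv("results/component_data.csv")
# Pick an l-mul based PE in bf16, then scale it to a 16x16 array with 4 cores.
mult, adder = get_pe_components(df, "lmul", "bf16", True, "pipelined")
pe = calculate_pe_metrics(mult, adder)
array = calculate_array_metrics(pe, array_size=16, num_cores=4)
print(format_metrics_for_display(array)["Total Performance"])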
hardware_accelerators/analysis/mnist_eval.py ADDED
@@ -0,0 +1,274 @@
1
+ # Evaluation function
2
+ from itertools import product
3
+ import os
4
+ import pandas as pd
5
+ from pyrtl import WireVector
6
+ import torch
7
+ from typing import Callable, Type
8
+ import numpy as np
9
+ from torchvision import datasets, transforms
10
+ from torch.utils.data import DataLoader
11
+ from torch.nn import CrossEntropyLoss
12
+ import multiprocessing as mp
13
+ from tqdm.auto import tqdm
14
+ import time
15
+ import csv
16
+ from pathlib import Path
17
+ import traceback
18
+
19
+ from ..simulation.matrix_utils import count_batch_gemm_tiles
20
+
21
+ from .config import (
22
+ NN_TEST_MUL_FNS,
23
+ NN_TEST_SYSTOLIC_ARRAY_SIZE,
24
+ NN_TEST_ACCUM_ADDR_WIDTH,
25
+ NN_TEST_WA_DTYPES,
26
+ NN_TEST_BATCH_SIZE,
27
+ )
28
+ from hardware_accelerators.dtypes import *
29
+ from hardware_accelerators.simulation.accelerator import CompiledAcceleratorSimulator
30
+ from hardware_accelerators.rtllib.accelerator import CompiledAcceleratorConfig
31
+ from hardware_accelerators.rtllib.multipliers import *
32
+ from hardware_accelerators.nn import load_model
33
34
+
35
+
36
+ def generate_test_configs(
37
+ weight_act_dtypes: list[tuple[Type[BaseFloat], Type[BaseFloat]]],
38
+ multiplier_fns: list[
39
+ Callable[[WireVector, WireVector, type[BaseFloat]], WireVector]
40
+ ],
41
+ ):
42
+ configs = []
43
+ for dtypes, mult_fn in product(weight_act_dtypes, multiplier_fns):
44
+ weight_type, act_type = dtypes
45
+ config = CompiledAcceleratorConfig(
46
+ array_size=NN_TEST_SYSTOLIC_ARRAY_SIZE,
47
+ weight_type=weight_type,
48
+ activation_type=act_type,
49
+ multiplier=mult_fn,
50
+ accum_addr_width=NN_TEST_ACCUM_ADDR_WIDTH,
51
+ )
52
+ configs.append(config)
53
+ return configs
54
+
55
+
56
+ def evaluate_with_progress(
57
+ config,
58
+ dataset,
59
+ batch_size,
60
+ criterion=CrossEntropyLoss(),
61
+ process_id=0,
62
+ ):
63
+ """Evaluate a model with progress tracking for the entire dataset"""
64
+ # Define a complete result template with default values
65
+ result = {
66
+ "config": config.name,
67
+ "weight_type": config.weight_type.__name__,
68
+ "activation_type": config.activation_type.__name__,
69
+ "multiplier": config.multiplier.__name__,
70
+ "avg_loss": float("nan"),
71
+ "accuracy": float("nan"),
72
+ "total_time": 0,
73
+ "batch_size": batch_size,
74
+ "total_batches": 0,
75
+ "total_samples": 0,
76
+ "samples_per_second": 0,
77
+ "error": None, # Will be None for successful runs
78
+ }
79
+
80
+ try:
81
+ start_time = time.time()
82
+
83
+ # Load the appropriate model based on weight type
84
+ if config.weight_type == Float32:
85
+ model = load_model("./models/mlp_mnist_fp32.pth")
86
+ else:
87
+ model = load_model("./models/mlp_mnist_bf16.pth")
88
+
89
+ # Create simulator
90
+ sim = CompiledAcceleratorSimulator(config, model=model)
91
+
92
+ if not sim.model_loaded:
93
+ result["error"] = "No model loaded"
94
+ return result
95
+
96
+ correct = 0
97
+ total = 0
98
+ running_loss = 0.0
99
+
100
+ data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
101
+ total_batches = len(data_loader)
102
+ tiles_per_batch = count_batch_gemm_tiles(
103
+ sim.hidden_dim, sim.input_dim + 1, sim.config.array_size
104
+ ) + count_batch_gemm_tiles(
105
+ sim.output_dim, sim.hidden_dim + 1, sim.config.array_size
106
+ )
107
+ result["total_batches"] = total_batches
108
+
109
+ # Create a progress bar for this specific simulation
110
+ desc = f"Config {config.name} ({config.weight_type.__name__}/{config.activation_type.__name__})"
111
+ with tqdm(
112
+ total=total_batches * tiles_per_batch,
113
+ desc=desc,
114
+ position=process_id + 1,
115
+ leave=False,
116
+ ) as pbar:
117
+ for batch, labels in data_loader:
118
+ batch_size_actual = batch.shape[0]
119
+ batch = batch.reshape(batch_size_actual, -1).numpy()
120
+
121
+ # Time the prediction
122
+ outputs = sim.predict_batch(batch, pbar)
123
+
124
+ loss = criterion(torch.tensor(outputs), labels)
125
+ running_loss += loss.item()
126
+
127
+ # Get predictions from the maximum value
128
+ predicted = np.argmax(outputs, axis=1)
129
+ total += labels.size(0)
130
+ correct += (predicted == labels).sum().item()
131
+
132
+ end_time = time.time()
133
+ total_time = end_time - start_time
134
+
135
+ # Update result with actual values
136
+ result.update(
137
+ {
138
+ "avg_loss": running_loss / total_batches,
139
+ "accuracy": 100.0 * correct / total,
140
+ "total_time": total_time,
141
+ "total_samples": total,
142
+ "samples_per_second": total / total_time,
143
+ }
144
+ )
145
+
146
+ return result
147
+
148
+ except Exception as e:
149
+ error_msg = (
150
+ f"Error evaluating {config.name}: {str(e)}\n{traceback.format_exc()}"
151
+ )
152
+ print(error_msg)
153
+ result["error"] = str(e)
154
+ return result
155
+
156
+
157
+ def save_result(result, output_file):
158
+ """Save a single result to CSV file"""
159
+ file_exists = os.path.isfile(output_file)
160
+
161
+ with open(output_file, "a", newline="") as f:
162
+ writer = csv.DictWriter(f, fieldnames=list(result.keys()))
163
+ if not file_exists:
164
+ writer.writeheader()
165
+ writer.writerow(result)
166
+
167
+
168
+ def process_config(config, dataset, batch_size, output_file, process_id):
169
+ """Process a single configuration and save results"""
170
+ # Set process name for better monitoring
171
+ mp.current_process().name = f"Sim-{config.name}"
172
+
173
+ # print(f"Starting evaluation of {config.name}")
174
+ result = evaluate_with_progress(config, dataset, batch_size, process_id=process_id)
175
+
176
+ # Save result immediately
177
+ save_result(result, output_file)
178
+
179
+ print(
180
+ f"Completed evaluation of {config.name}: Accuracy = {result.get('accuracy', 'ERROR'):.2f}%, "
181
+ f"Time = {result.get('total_time', 0):.2f}s, "
182
+ f"Speed = {result.get('samples_per_second', 0):.2f} samples/s"
183
+ )
184
+ return result
185
+
186
+
187
+ def main():
188
+ # Create output directory
189
+ output_dir = Path("results")
190
+ output_dir.mkdir(exist_ok=True)
191
+ output_file = output_dir / "mnist_eval.csv"
192
+
193
+ # Data transformation: convert images to tensor and normalize them
194
+ transform = transforms.Compose(
195
+ [
196
+ transforms.ToTensor(),
197
+ transforms.Normalize((0.1307,), (0.3081,)),
198
+ ]
199
+ )
200
+
201
+ # Download MNIST test data
202
+ test_dataset = datasets.MNIST(
203
+ root="./data", train=False, download=True, transform=transform
204
+ )
205
+
206
+ configs = generate_test_configs(NN_TEST_WA_DTYPES, NN_TEST_MUL_FNS)
207
+
208
+ # Create a multiprocessing pool
209
+ num_processes = max(1, min(len(configs), mp.cpu_count() - 2))  # keep at least one worker
210
+ print(f"Using {num_processes} processes to evaluate {len(configs)} configurations")
211
+ print(
212
+ f"Each simulation will process the entire test dataset with batch size {NN_TEST_BATCH_SIZE}"
213
+ )
214
+
215
+ # Clear the screen and set up for progress bars
216
+ print("\n\n")
217
+
218
+ # Start the pool and process configurations
219
+ with mp.Pool(processes=num_processes) as pool:
220
+ # Create a list to track all tasks
221
+ tasks = []
222
+
223
+ # Submit all tasks
224
+ for i, config in enumerate(configs):
225
+ task = pool.apply_async(
226
+ process_config,
227
+ args=(config, test_dataset, NN_TEST_BATCH_SIZE, output_file, i),
228
+ )
229
+ tasks.append((config.name, task))
230
+
231
+ # Set up the main progress bar at the top
232
+ with tqdm(total=len(tasks), desc="Overall Progress", position=0) as pbar:
233
+ completed = 0
234
+ while completed < len(tasks):
235
+ new_completed = sum(1 for _, task in tasks if task.ready())
236
+ if new_completed > completed:
237
+ pbar.update(new_completed - completed)
238
+ completed = new_completed
239
+ time.sleep(0.5)
240
+
241
+ # Make sure all tasks are properly completed and collect results
242
+ all_results = []
243
+ for config_name, task in tasks:
244
+ try:
245
+ result = task.get()
246
+ all_results.append(result)
247
+ except Exception as e:
248
+ print(f"Error in task {config_name}: {str(e)}")
249
+
250
+ print(f"All evaluations complete. Results saved to {output_file}")
251
+
252
+ # Create a summary DataFrame and display it
253
+ if all_results:
254
+ df = pd.DataFrame(all_results)
255
+ print("\nSummary of Results (sorted by accuracy):")
256
+ summary_cols = [
257
+ "config",
258
+ "weight_type",
259
+ "activation_type",
260
+ "multiplier",
261
+ "accuracy",
262
+ "total_time",
263
+ "samples_per_second",
264
+ ]
265
+ print(df[summary_cols].sort_values("accuracy", ascending=False))
266
+
267
+ print("\nSummary of Results (sorted by speed):")
268
+ print(df[summary_cols].sort_values("samples_per_second", ascending=False))
269
+
270
+
271
+ if __name__ == "__main__":
272
+ # Set start method for multiprocessing
273
+ mp.set_start_method("spawn", force=True) # Use 'spawn' for better compatibility
274
+ main()
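For debugging, a serial sketch of a single-configuration run that skips the multiprocessing pool; it assumes the same MNIST setup as main() and that the model checkpoints referenced in evaluate_with_progress exist on disk.

from torchvision import datasets, transforms

transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)
test_dataset = datasets.MNIST(root="./data", train=False, download=True, transform=transform)
config = generate_test_configs(NN_TEST_WA_DTYPES, NN_TEST_MUL_FNS)[0]  # first config only
result = evaluate_with_progress(config, test_dataset, NN_TEST_BATCH_SIZE)
print(result["accuracy"], result["samples_per_second"])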
hardware_accelerators/analysis/simple_circuits.py ADDED
@@ -0,0 +1,258 @@
1
+ import pyrtl
2
+ from pyrtl import WireVector, Input, Output, Simulation
3
+ import numpy as np
4
+ import sys
5
+ import os
6
+
7
+ # Add the parent directory to the path so we can import from hardware_accelerators
8
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
9
+
10
+ from hardware_accelerators.dtypes import Float16, Float32, Float8, BF16
11
+
12
+
13
+ def create_simple_adder(data_type):
14
+ """
15
+ Create a simple adder circuit (a + b) using PyRTL's built-in operators.
16
+
17
+ Args:
18
+ data_type: The data type to use (only used for bitwidth)
19
+
20
+ Returns:
21
+ The PyRTL working block
22
+ """
23
+ # Clear any existing PyRTL design
24
+ pyrtl.reset_working_block()
25
+
26
+ # Create input and output wires
27
+ a = Input(data_type.bitwidth(), 'a')
28
+ b = Input(data_type.bitwidth(), 'b')
29
+ result = Output(data_type.bitwidth(), 'result')
30
+
31
+ # Create adder using PyRTL's built-in addition
32
+ # Note: This treats the inputs as unsigned integers, not floating point
33
+ result <<= a + b
34
+
35
+ return pyrtl.working_block()
36
+
37
+
38
+ def create_simple_multiplier(data_type):
39
+ """
40
+ Create a simple multiplier circuit (a * b) using PyRTL's built-in operators.
41
+
42
+ Args:
43
+ data_type: The data type to use (only used for bitwidth)
44
+
45
+ Returns:
46
+ The PyRTL working block
47
+ """
48
+ # Clear any existing PyRTL design
49
+ pyrtl.reset_working_block()
50
+
51
+ # Create input and output wires
52
+ a = Input(data_type.bitwidth(), 'a')
53
+ b = Input(data_type.bitwidth(), 'b')
54
+ result = Output(data_type.bitwidth(), 'result')
55
+
56
+ # Create multiplier using PyRTL's built-in multiplication
57
+ # Note: This treats the inputs as unsigned integers, not floating point
58
+ # We'll truncate the result to match the input bitwidth
59
+ mult_result = a * b
60
+ result <<= mult_result[:data_type.bitwidth()]
61
+
62
+ return pyrtl.working_block()
63
+
64
+
65
+ def create_pipelined_adder(data_type):
66
+ """
67
+ Create a pipelined adder circuit (a + b) using PyRTL's built-in operators.
68
+
69
+ Args:
70
+ data_type: The data type to use (only used for bitwidth)
71
+
72
+ Returns:
73
+ The PyRTL working block and the result wire
74
+ """
75
+ # Clear any existing PyRTL design
76
+ pyrtl.reset_working_block()
77
+
78
+ # Create input and output wires
79
+ a = Input(data_type.bitwidth(), 'a')
80
+ b = Input(data_type.bitwidth(), 'b')
81
+ result = Output(data_type.bitwidth(), 'result')
82
+
83
+ # Create pipeline registers
84
+ a_reg = pyrtl.Register(bitwidth=data_type.bitwidth(), name='a_reg')
85
+ b_reg = pyrtl.Register(bitwidth=data_type.bitwidth(), name='b_reg')
86
+
87
+ # Connect input to registers
88
+ a_reg.next <<= a
89
+ b_reg.next <<= b
90
+
91
+ # Perform addition in the next stage using PyRTL's built-in addition
92
+ add_result = a_reg + b_reg
93
+
94
+ # Connect to output
95
+ result <<= add_result
96
+
97
+ return pyrtl.working_block(), result
98
+
99
+
100
+ def create_pipelined_multiplier(data_type):
101
+ """
102
+ Create a pipelined multiplier circuit (a * b) using PyRTL's built-in operators.
103
+
104
+ Args:
105
+ data_type: The data type to use (only used for bitwidth)
106
+
107
+ Returns:
108
+ The PyRTL working block and the result wire
109
+ """
110
+ # Clear any existing PyRTL design
111
+ pyrtl.reset_working_block()
112
+
113
+ # Create input and output wires
114
+ a = Input(data_type.bitwidth(), 'a')
115
+ b = Input(data_type.bitwidth(), 'b')
116
+ result = Output(data_type.bitwidth(), 'result')
117
+
118
+ # Create pipeline registers
119
+ a_reg = pyrtl.Register(bitwidth=data_type.bitwidth(), name='a_reg')
120
+ b_reg = pyrtl.Register(bitwidth=data_type.bitwidth(), name='b_reg')
121
+
122
+ # Connect input to registers
123
+ a_reg.next <<= a
124
+ b_reg.next <<= b
125
+
126
+ # Perform multiplication in the next stage using PyRTL's built-in multiplication
127
+ mult_result = a_reg * b_reg
128
+
129
+ # Truncate the result to match the input bitwidth
130
+ truncated_result = mult_result[:data_type.bitwidth()]
131
+
132
+ # Connect to output
133
+ result <<= truncated_result
134
+
135
+ return pyrtl.working_block(), result
136
+
137
+
138
+ def simulate_circuit(block, data_type, num_cycles=10):
139
+ """
140
+ Simulate a circuit with random inputs.
141
+
142
+ Args:
143
+ block: The PyRTL block to simulate
144
+ data_type: The data type used (only for bitwidth)
145
+ num_cycles: Number of simulation cycles
146
+
147
+ Returns:
148
+ sim: The simulation object
149
+ trace: The simulation trace
150
+ """
151
+ # Create simulation with tracing enabled
152
+ sim = Simulation()
153
+
154
+ # Create test data (random integers within the valid range for the bitwidth)
155
+ bitwidth = data_type.bitwidth()
156
+
157
+ # Handle large bitwidths safely
158
+ if bitwidth > 30: # Avoid overflow for large bitwidths
159
+ a_values = [np.random.randint(0, 2**16) for _ in range(num_cycles)]
160
+ b_values = [np.random.randint(0, 2**16) for _ in range(num_cycles)]
161
+ else:
162
+ max_val = 2**bitwidth - 1
163
+ a_values = [np.random.randint(0, max_val) for _ in range(num_cycles)]
164
+ b_values = [np.random.randint(0, max_val) for _ in range(num_cycles)]
165
+
166
+ # Create input dictionaries for each cycle
167
+ input_vectors = []
168
+ for i in range(num_cycles):
169
+ cycle_inputs = {
170
+ 'a': a_values[i],
171
+ 'b': b_values[i]
172
+ }
173
+ input_vectors.append(cycle_inputs)
174
+
175
+ # Run simulation for each cycle
176
+ for i in range(num_cycles):
177
+ sim.step(input_vectors[i])
178
+
179
+ # Get trace from the tracer
180
+ tracer = sim.tracer
181
+
182
+ return sim, tracer
183
+
184
+
185
+ def main():
186
+ """Main function to create and test the simplified circuits."""
187
+ # Data types to test (only used for bitwidth)
188
+ data_types = [Float8, BF16, Float16, Float32]
189
+
190
+ # Results dictionary
191
+ results = {
192
+ "simple_adder": {},
193
+ "simple_multiplier": {},
194
+ "pipelined_adder": {},
195
+ "pipelined_multiplier": {}
196
+ }
197
+
198
+ # Test simple adders
199
+ print("=== Testing Simple Adders ===")
200
+ for dtype in data_types:
201
+ print(f"\nCreating and simulating {dtype.__name__} adder (bitwidth: {dtype.bitwidth()})...")
202
+ block = create_simple_adder(dtype)
203
+ sim, tracer = simulate_circuit(block, dtype)
204
+
205
+ # Print some results
206
+ if 'result' in tracer.trace:
207
+ output_values = tracer.trace['result']
208
+ print(f" result: {output_values}")
209
+
210
+ results["simple_adder"][dtype.__name__] = block
211
+
212
+ # Test simple multipliers
213
+ print("\n=== Testing Simple Multipliers ===")
214
+ for dtype in data_types:
215
+ print(f"\nCreating and simulating {dtype.__name__} multiplier (bitwidth: {dtype.bitwidth()})...")
216
+ block = create_simple_multiplier(dtype)
217
+ sim, tracer = simulate_circuit(block, dtype)
218
+
219
+ # Print some results
220
+ if 'result' in tracer.trace:
221
+ output_values = tracer.trace['result']
222
+ print(f" result: {output_values}")
223
+
224
+ results["simple_multiplier"][dtype.__name__] = block
225
+
226
+ # Test pipelined adders
227
+ print("\n=== Testing Pipelined Adders ===")
228
+ for dtype in data_types:
229
+ print(f"\nCreating and simulating {dtype.__name__} pipelined adder (bitwidth: {dtype.bitwidth()})...")
230
+ block, _ = create_pipelined_adder(dtype)
231
+ sim, tracer = simulate_circuit(block, dtype)
232
+
233
+ # Print some results
234
+ if 'result' in tracer.trace:
235
+ output_values = tracer.trace['result']
236
+ print(f" result: {output_values}")
237
+
238
+ results["pipelined_adder"][dtype.__name__] = block
239
+
240
+ # Test pipelined multipliers
241
+ print("\n=== Testing Pipelined Multipliers ===")
242
+ for dtype in data_types:
243
+ print(f"\nCreating and simulating {dtype.__name__} pipelined multiplier (bitwidth: {dtype.bitwidth()})...")
244
+ block, _ = create_pipelined_multiplier(dtype)
245
+ sim, tracer = simulate_circuit(block, dtype)
246
+
247
+ # Print some results
248
+ if 'result' in tracer.trace:
249
+ output_values = tracer.trace['result']
250
+ print(f" result: {output_values}")
251
+
252
+ results["pipelined_multiplier"][dtype.__name__] = block
253
+
254
+ return results
255
+
256
+
257
+ if __name__ == "__main__":
258
+ main()
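A direct-use sketch for building and simulating a single circuit outside of main(); the inputs are random unsigned integers, so the trace shows raw integer arithmetic rather than floating-point results.

block = create_simple_adder(BF16)
sim, tracer = simulate_circuit(block, BF16, num_cycles=4)
print(tracer.trace["result"])  # one integer output per simulated cycle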
hardware_accelerators/analysis/verilog_export.py ADDED
@@ -0,0 +1,86 @@
1
+ import os
2
+ import sys
3
+ import pyrtl
4
+ from pyrtl import WireVector, Input, Output, Simulation
5
+
6
+ # Add the parent directory to the path so we can import from hardware_accelerators
7
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
8
+
9
+ from hardware_accelerators.dtypes import Float16, Float32, Float8, BF16
10
+ from hardware_accelerators.analysis.simple_circuits import (
11
+ create_simple_adder,
12
+ create_simple_multiplier,
13
+ create_pipelined_adder,
14
+ create_pipelined_multiplier
15
+ )
16
+
17
+ def export_to_verilog(block, output_filename, add_reset=True, initialize_registers=False):
18
+ """
19
+ Export a PyRTL block to a Verilog file.
20
+
21
+ Args:
22
+ block: The PyRTL block to export
23
+ output_filename: The filename to write the Verilog code to
24
+ add_reset: If reset logic should be added. Options are:
25
+ False (no reset logic),
26
+ True (synchronous reset logic),
27
+ 'asynchronous' (asynchronous reset logic)
28
+ initialize_registers: Initialize Verilog registers to their reset_value
29
+ """
30
+ # Create the output directory if it doesn't exist
31
+ os.makedirs(os.path.dirname(output_filename), exist_ok=True)
32
+
33
+ # Export the block to Verilog
34
+ with open(output_filename, 'w') as f:
35
+ pyrtl.output_to_verilog(
36
+ f,
37
+ add_reset=add_reset,
38
+ initialize_registers=initialize_registers,
39
+ block=block
40
+ )
41
+
42
+ print(f"Exported Verilog to {output_filename}")
43
+
44
+ def export_all_circuits():
45
+ """
46
+ Export all simple circuits to Verilog files.
47
+ """
48
+ # Create output directory
49
+ output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "verilog_output")
50
+ os.makedirs(output_dir, exist_ok=True)
51
+
52
+ # List of data types to use
53
+ data_types = [Float8, Float16, BF16, Float32]
54
+
55
+ # Export simple adder for each data type
56
+ for dtype in data_types:
57
+ block = create_simple_adder(dtype)
58
+ output_filename = os.path.join(output_dir, f"simple_adder_{dtype.__name__}.v")
59
+ export_to_verilog(block, output_filename)
60
+
61
+ # Export simple multiplier for each data type
62
+ for dtype in data_types:
63
+ block = create_simple_multiplier(dtype)
64
+ output_filename = os.path.join(output_dir, f"simple_multiplier_{dtype.__name__}.v")
65
+ export_to_verilog(block, output_filename)
66
+
67
+ # Export pipelined adder for each data type
68
+ for dtype in data_types:
69
+ block, _ = create_pipelined_adder(dtype)
70
+ output_filename = os.path.join(output_dir, f"pipelined_adder_{dtype.__name__}.v")
71
+ export_to_verilog(block, output_filename, initialize_registers=True)
72
+
73
+ # Export pipelined multiplier for each data type
74
+ for dtype in data_types:
75
+ block, _ = create_pipelined_multiplier(dtype)
76
+ output_filename = os.path.join(output_dir, f"pipelined_multiplier_{dtype.__name__}.v")
77
+ export_to_verilog(block, output_filename, initialize_registers=True)
78
+
79
+ def main():
80
+ # Export all circuits
81
+ export_all_circuits()
82
+
83
+ print("All circuits exported to Verilog successfully!")
84
+
85
+ if __name__ == "__main__":
86
+ main()
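A single-file export sketch, using the same output naming convention as export_all_circuits.

block, _ = create_pipelined_multiplier(BF16)
export_to_verilog(block, "verilog_output/pipelined_multiplier_BF16.v", initialize_registers=True)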
hardware_accelerators/analysis/verilog_output/pipelined_adder_BF16.v ADDED
@@ -0,0 +1,37 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[15:0] a;
9
+ input[15:0] b;
10
+ output[15:0] result;
11
+
12
+ reg[15:0] a_reg = 16'd0;
13
+ reg[15:0] b_reg = 16'd0;
14
+
15
+ wire[16:0] tmp20;
16
+ wire[15:0] tmp21;
17
+
18
+ // Combinational
19
+ assign result = tmp21;
20
+ assign tmp20 = a_reg + b_reg;
21
+ assign tmp21 = {tmp20[15], tmp20[14], tmp20[13], tmp20[12], tmp20[11], tmp20[10], tmp20[9], tmp20[8], tmp20[7], tmp20[6], tmp20[5], tmp20[4], tmp20[3], tmp20[2], tmp20[1], tmp20[0]};
22
+
23
+ // Registers
24
+ always @(posedge clk)
25
+ begin
26
+ if (rst) begin
27
+ a_reg <= 0;
28
+ b_reg <= 0;
29
+ end
30
+ else begin
31
+ a_reg <= a;
32
+ b_reg <= b;
33
+ end
34
+ end
35
+
36
+ endmodule
37
+
hardware_accelerators/analysis/verilog_output/pipelined_adder_Float16.v ADDED
@@ -0,0 +1,37 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[15:0] a;
9
+ input[15:0] b;
10
+ output[15:0] result;
11
+
12
+ reg[15:0] a_reg = 16'd0;
13
+ reg[15:0] b_reg = 16'd0;
14
+
15
+ wire[16:0] tmp18;
16
+ wire[15:0] tmp19;
17
+
18
+ // Combinational
19
+ assign result = tmp19;
20
+ assign tmp18 = a_reg + b_reg;
21
+ assign tmp19 = {tmp18[15], tmp18[14], tmp18[13], tmp18[12], tmp18[11], tmp18[10], tmp18[9], tmp18[8], tmp18[7], tmp18[6], tmp18[5], tmp18[4], tmp18[3], tmp18[2], tmp18[1], tmp18[0]};
22
+
23
+ // Registers
24
+ always @(posedge clk)
25
+ begin
26
+ if (rst) begin
27
+ a_reg <= 0;
28
+ b_reg <= 0;
29
+ end
30
+ else begin
31
+ a_reg <= a;
32
+ b_reg <= b;
33
+ end
34
+ end
35
+
36
+ endmodule
37
+
hardware_accelerators/analysis/verilog_output/pipelined_adder_Float32.v ADDED
@@ -0,0 +1,37 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[31:0] a;
9
+ input[31:0] b;
10
+ output[31:0] result;
11
+
12
+ reg[31:0] a_reg = 32'd0;
13
+ reg[31:0] b_reg = 32'd0;
14
+
15
+ wire[32:0] tmp22;
16
+ wire[31:0] tmp23;
17
+
18
+ // Combinational
19
+ assign result = tmp23;
20
+ assign tmp22 = a_reg + b_reg;
21
+ assign tmp23 = {tmp22[31], tmp22[30], tmp22[29], tmp22[28], tmp22[27], tmp22[26], tmp22[25], tmp22[24], tmp22[23], tmp22[22], tmp22[21], tmp22[20], tmp22[19], tmp22[18], tmp22[17], tmp22[16], tmp22[15], tmp22[14], tmp22[13], tmp22[12], tmp22[11], tmp22[10], tmp22[9], tmp22[8], tmp22[7], tmp22[6], tmp22[5], tmp22[4], tmp22[3], tmp22[2], tmp22[1], tmp22[0]};
22
+
23
+ // Registers
24
+ always @(posedge clk)
25
+ begin
26
+ if (rst) begin
27
+ a_reg <= 0;
28
+ b_reg <= 0;
29
+ end
30
+ else begin
31
+ a_reg <= a;
32
+ b_reg <= b;
33
+ end
34
+ end
35
+
36
+ endmodule
37
+
hardware_accelerators/analysis/verilog_output/pipelined_adder_Float8.v ADDED
@@ -0,0 +1,37 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[7:0] a;
9
+ input[7:0] b;
10
+ output[7:0] result;
11
+
12
+ reg[7:0] a_reg = 8'd0;
13
+ reg[7:0] b_reg = 8'd0;
14
+
15
+ wire[8:0] tmp16;
16
+ wire[7:0] tmp17;
17
+
18
+ // Combinational
19
+ assign result = tmp17;
20
+ assign tmp16 = a_reg + b_reg;
21
+ assign tmp17 = {tmp16[7], tmp16[6], tmp16[5], tmp16[4], tmp16[3], tmp16[2], tmp16[1], tmp16[0]};
22
+
23
+ // Registers
24
+ always @(posedge clk)
25
+ begin
26
+ if (rst) begin
27
+ a_reg <= 0;
28
+ b_reg <= 0;
29
+ end
30
+ else begin
31
+ a_reg <= a;
32
+ b_reg <= b;
33
+ end
34
+ end
35
+
36
+ endmodule
37
+
hardware_accelerators/analysis/verilog_output/pipelined_multiplier_BF16.v ADDED
@@ -0,0 +1,37 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[15:0] a;
9
+ input[15:0] b;
10
+ output[15:0] result;
11
+
12
+ reg[15:0] a_reg = 16'd0;
13
+ reg[15:0] b_reg = 16'd0;
14
+
15
+ wire[31:0] tmp28;
16
+ wire[15:0] tmp29;
17
+
18
+ // Combinational
19
+ assign result = tmp29;
20
+ assign tmp28 = a_reg * b_reg;
21
+ assign tmp29 = {tmp28[15], tmp28[14], tmp28[13], tmp28[12], tmp28[11], tmp28[10], tmp28[9], tmp28[8], tmp28[7], tmp28[6], tmp28[5], tmp28[4], tmp28[3], tmp28[2], tmp28[1], tmp28[0]};
22
+
23
+ // Registers
24
+ always @(posedge clk)
25
+ begin
26
+ if (rst) begin
27
+ a_reg <= 0;
28
+ b_reg <= 0;
29
+ end
30
+ else begin
31
+ a_reg <= a;
32
+ b_reg <= b;
33
+ end
34
+ end
35
+
36
+ endmodule
37
+
hardware_accelerators/analysis/verilog_output/pipelined_multiplier_Float16.v ADDED
@@ -0,0 +1,37 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[15:0] a;
9
+ input[15:0] b;
10
+ output[15:0] result;
11
+
12
+ reg[15:0] a_reg = 16'd0;
13
+ reg[15:0] b_reg = 16'd0;
14
+
15
+ wire[31:0] tmp26;
16
+ wire[15:0] tmp27;
17
+
18
+ // Combinational
19
+ assign result = tmp27;
20
+ assign tmp26 = a_reg * b_reg;
21
+ assign tmp27 = {tmp26[15], tmp26[14], tmp26[13], tmp26[12], tmp26[11], tmp26[10], tmp26[9], tmp26[8], tmp26[7], tmp26[6], tmp26[5], tmp26[4], tmp26[3], tmp26[2], tmp26[1], tmp26[0]};
22
+
23
+ // Registers
24
+ always @(posedge clk)
25
+ begin
26
+ if (rst) begin
27
+ a_reg <= 0;
28
+ b_reg <= 0;
29
+ end
30
+ else begin
31
+ a_reg <= a;
32
+ b_reg <= b;
33
+ end
34
+ end
35
+
36
+ endmodule
37
+
hardware_accelerators/analysis/verilog_output/pipelined_multiplier_Float32.v ADDED
@@ -0,0 +1,37 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[31:0] a;
9
+ input[31:0] b;
10
+ output[31:0] result;
11
+
12
+ reg[31:0] a_reg = 32'd0;
13
+ reg[31:0] b_reg = 32'd0;
14
+
15
+ wire[63:0] tmp30;
16
+ wire[31:0] tmp31;
17
+
18
+ // Combinational
19
+ assign result = tmp31;
20
+ assign tmp30 = a_reg * b_reg;
21
+ assign tmp31 = {tmp30[31], tmp30[30], tmp30[29], tmp30[28], tmp30[27], tmp30[26], tmp30[25], tmp30[24], tmp30[23], tmp30[22], tmp30[21], tmp30[20], tmp30[19], tmp30[18], tmp30[17], tmp30[16], tmp30[15], tmp30[14], tmp30[13], tmp30[12], tmp30[11], tmp30[10], tmp30[9], tmp30[8], tmp30[7], tmp30[6], tmp30[5], tmp30[4], tmp30[3], tmp30[2], tmp30[1], tmp30[0]};
22
+
23
+ // Registers
24
+ always @(posedge clk)
25
+ begin
26
+ if (rst) begin
27
+ a_reg <= 0;
28
+ b_reg <= 0;
29
+ end
30
+ else begin
31
+ a_reg <= a;
32
+ b_reg <= b;
33
+ end
34
+ end
35
+
36
+ endmodule
37
+
hardware_accelerators/analysis/verilog_output/pipelined_multiplier_Float8.v ADDED
@@ -0,0 +1,37 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[7:0] a;
9
+ input[7:0] b;
10
+ output[7:0] result;
11
+
12
+ reg[7:0] a_reg = 8'd0;
13
+ reg[7:0] b_reg = 8'd0;
14
+
15
+ wire[15:0] tmp24;
16
+ wire[7:0] tmp25;
17
+
18
+ // Combinational
19
+ assign result = tmp25;
20
+ assign tmp24 = a_reg * b_reg;
21
+ assign tmp25 = {tmp24[7], tmp24[6], tmp24[5], tmp24[4], tmp24[3], tmp24[2], tmp24[1], tmp24[0]};
22
+
23
+ // Registers
24
+ always @(posedge clk)
25
+ begin
26
+ if (rst) begin
27
+ a_reg <= 0;
28
+ b_reg <= 0;
29
+ end
30
+ else begin
31
+ a_reg <= a;
32
+ b_reg <= b;
33
+ end
34
+ end
35
+
36
+ endmodule
37
+
hardware_accelerators/analysis/verilog_output/simple_adder_BF16.v ADDED
@@ -0,0 +1,21 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[15:0] a;
9
+ input[15:0] b;
10
+ output[15:0] result;
11
+
12
+ wire[16:0] tmp4;
13
+ wire[15:0] tmp5;
14
+
15
+ // Combinational
16
+ assign result = tmp5;
17
+ assign tmp4 = a + b;
18
+ assign tmp5 = {tmp4[15], tmp4[14], tmp4[13], tmp4[12], tmp4[11], tmp4[10], tmp4[9], tmp4[8], tmp4[7], tmp4[6], tmp4[5], tmp4[4], tmp4[3], tmp4[2], tmp4[1], tmp4[0]};
19
+
20
+ endmodule
21
+
hardware_accelerators/analysis/verilog_output/simple_adder_Float16.v ADDED
@@ -0,0 +1,21 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[15:0] a;
9
+ input[15:0] b;
10
+ output[15:0] result;
11
+
12
+ wire[16:0] tmp2;
13
+ wire[15:0] tmp3;
14
+
15
+ // Combinational
16
+ assign result = tmp3;
17
+ assign tmp2 = a + b;
18
+ assign tmp3 = {tmp2[15], tmp2[14], tmp2[13], tmp2[12], tmp2[11], tmp2[10], tmp2[9], tmp2[8], tmp2[7], tmp2[6], tmp2[5], tmp2[4], tmp2[3], tmp2[2], tmp2[1], tmp2[0]};
19
+
20
+ endmodule
21
+
hardware_accelerators/analysis/verilog_output/simple_adder_Float32.v ADDED
@@ -0,0 +1,21 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[31:0] a;
9
+ input[31:0] b;
10
+ output[31:0] result;
11
+
12
+ wire[32:0] tmp6;
13
+ wire[31:0] tmp7;
14
+
15
+ // Combinational
16
+ assign result = tmp7;
17
+ assign tmp6 = a + b;
18
+ assign tmp7 = {tmp6[31], tmp6[30], tmp6[29], tmp6[28], tmp6[27], tmp6[26], tmp6[25], tmp6[24], tmp6[23], tmp6[22], tmp6[21], tmp6[20], tmp6[19], tmp6[18], tmp6[17], tmp6[16], tmp6[15], tmp6[14], tmp6[13], tmp6[12], tmp6[11], tmp6[10], tmp6[9], tmp6[8], tmp6[7], tmp6[6], tmp6[5], tmp6[4], tmp6[3], tmp6[2], tmp6[1], tmp6[0]};
19
+
20
+ endmodule
21
+
hardware_accelerators/analysis/verilog_output/simple_adder_Float8.v ADDED
@@ -0,0 +1,21 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[7:0] a;
9
+ input[7:0] b;
10
+ output[7:0] result;
11
+
12
+ wire[8:0] tmp0;
13
+ wire[7:0] tmp1;
14
+
15
+ // Combinational
16
+ assign result = tmp1;
17
+ assign tmp0 = a + b;
18
+ assign tmp1 = {tmp0[7], tmp0[6], tmp0[5], tmp0[4], tmp0[3], tmp0[2], tmp0[1], tmp0[0]};
19
+
20
+ endmodule
21
+
hardware_accelerators/analysis/verilog_output/simple_multiplier_BF16.v ADDED
@@ -0,0 +1,21 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[15:0] a;
9
+ input[15:0] b;
10
+ output[15:0] result;
11
+
12
+ wire[31:0] tmp12;
13
+ wire[15:0] tmp13;
14
+
15
+ // Combinational
16
+ assign result = tmp13;
17
+ assign tmp12 = a * b;
18
+ assign tmp13 = {tmp12[15], tmp12[14], tmp12[13], tmp12[12], tmp12[11], tmp12[10], tmp12[9], tmp12[8], tmp12[7], tmp12[6], tmp12[5], tmp12[4], tmp12[3], tmp12[2], tmp12[1], tmp12[0]};
19
+
20
+ endmodule
21
+
hardware_accelerators/analysis/verilog_output/simple_multiplier_Float16.v ADDED
@@ -0,0 +1,21 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[15:0] a;
9
+ input[15:0] b;
10
+ output[15:0] result;
11
+
12
+ wire[31:0] tmp10;
13
+ wire[15:0] tmp11;
14
+
15
+ // Combinational
16
+ assign result = tmp11;
17
+ assign tmp10 = a * b;
18
+ assign tmp11 = {tmp10[15], tmp10[14], tmp10[13], tmp10[12], tmp10[11], tmp10[10], tmp10[9], tmp10[8], tmp10[7], tmp10[6], tmp10[5], tmp10[4], tmp10[3], tmp10[2], tmp10[1], tmp10[0]};
19
+
20
+ endmodule
21
+
hardware_accelerators/analysis/verilog_output/simple_multiplier_Float32.v ADDED
@@ -0,0 +1,21 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[31:0] a;
9
+ input[31:0] b;
10
+ output[31:0] result;
11
+
12
+ wire[63:0] tmp14;
13
+ wire[31:0] tmp15;
14
+
15
+ // Combinational
16
+ assign result = tmp15;
17
+ assign tmp14 = a * b;
18
+ assign tmp15 = {tmp14[31], tmp14[30], tmp14[29], tmp14[28], tmp14[27], tmp14[26], tmp14[25], tmp14[24], tmp14[23], tmp14[22], tmp14[21], tmp14[20], tmp14[19], tmp14[18], tmp14[17], tmp14[16], tmp14[15], tmp14[14], tmp14[13], tmp14[12], tmp14[11], tmp14[10], tmp14[9], tmp14[8], tmp14[7], tmp14[6], tmp14[5], tmp14[4], tmp14[3], tmp14[2], tmp14[1], tmp14[0]};
19
+
20
+ endmodule
21
+
hardware_accelerators/analysis/verilog_output/simple_multiplier_Float8.v ADDED
@@ -0,0 +1,21 @@
1
+ // Generated automatically via PyRTL
2
+ // As one initial test of synthesis, map to FPGA with:
3
+ // yosys -p "synth_xilinx -top toplevel" thisfile.v
4
+
5
+ module toplevel(clk, rst, a, b, result);
6
+ input clk;
7
+ input rst;
8
+ input[7:0] a;
9
+ input[7:0] b;
10
+ output[7:0] result;
11
+
12
+ wire[15:0] tmp8;
13
+ wire[7:0] tmp9;
14
+
15
+ // Combinational
16
+ assign result = tmp9;
17
+ assign tmp8 = a * b;
18
+ assign tmp9 = {tmp8[7], tmp8[6], tmp8[5], tmp8[4], tmp8[3], tmp8[2], tmp8[1], tmp8[0]};
19
+
20
+ endmodule
21
+
hardware_accelerators/app.py ADDED
@@ -0,0 +1,388 @@
1
+ import os
2
+ from typing import Literal
3
+ import gradio as gr
4
+ from gradio.components.image_editor import EditorValue
5
+ import numpy as np
6
+ import pandas as pd
7
+ from PIL import Image
8
+ from torchvision import transforms
9
+ from .nn.util import load_model
10
+ from .rtllib.lmul import lmul_simple
11
+ from .rtllib.multipliers import float_multiplier
12
+ from .dtypes import Float8, BF16
13
+ from .rtllib import (
14
+ CompiledAcceleratorConfig,
15
+ )
16
+ from .simulation import CompiledAcceleratorSimulator
17
+ from .analysis.hardware_stats import (
18
+ calculate_hardware_stats,
19
+ )
20
+
21
+ __all__ = ["create_app"]
22
+
23
+ # ------------ CONSTANTS ------------ #
24
+
25
+ # Load the component data
26
+ data_path = os.environ.get("COMPONENT_DATA_PATH", "results/component_data.csv")
27
+ DF = pd.read_csv(data_path)
28
+
29
+ # Load the trained model
30
+ MODEL = load_model("models/mlp_mnist.pth", "cpu") # type: ignore
31
+ MODEL.eval()
32
+
33
+ classes = [
34
+ "zero",
35
+ "one",
36
+ "two",
37
+ "three",
38
+ "four",
39
+ "five",
40
+ "six",
41
+ "seven",
42
+ "eight",
43
+ "nine",
44
+ ]
45
+ labels_value = {label: 0.0 for label in classes}
46
+
47
+ accelerator_dtypes = ["float8", "bfloat16", "float32"]
48
+ dtype_map = {
49
+ "float8": Float8,
50
+ "bfloat16": BF16,
51
+ "float32": BF16, # TODO: use Float32, but not right now because its slow
52
+ }
53
+
54
+
55
+ mult_map = {
56
+ "IEEE 754": float_multiplier,
57
+ "l-mul": lmul_simple,
58
+ }
59
+
60
+
61
+ # ------------ Event Listener Functions ------------ #
62
+
63
+
64
+ def filter_activation_types(weight_type: str, activation_type: str):
65
+ if weight_type == "float8":
66
+ return gr.update(choices=accelerator_dtypes)
67
+ elif weight_type == "bfloat16":
68
+ if activation_type == "float8":
69
+ activation_type = "bfloat16"
70
+ return gr.update(value=activation_type, choices=["bfloat16", "float32"])
71
+ elif weight_type == "float32":
72
+ if activation_type != "float32":
73
+ activation_type = "float32"
74
+ return gr.update(value=activation_type, choices=["float32"])
75
+
76
+
77
+ def warn_w8a8(weight_type: str, activation_type: str):
78
+ if weight_type == "float8" and activation_type == "float8":
79
+ gr.Warning(
80
+ "W8A8 has poor performance without quantization, which is not yet supported in simulation. Theoretical results are still calculated for FP8 hardware",
81
+ duration=5,
82
+ )
83
+
84
+
85
+ def image_to_tensor(sketchpad: EditorValue):
86
+ image = sketchpad["composite"]
87
+ image = image.resize((28, 28), Image.Resampling.LANCZOS) # type: ignore
88
+ img_array = np.transpose(np.array(image), (2, 0, 1))[-1]
89
+
90
+ # Preprocessing: convert image to tensor and normalize
91
+ transform = transforms.Compose(
92
+ [
93
+ transforms.ToTensor(),
94
+ transforms.Normalize((0.1307,), (0.3081,)),
95
+ ]
96
+ )
97
+ tensor_image = transform(img_array)
98
+ return tensor_image
99
+
100
+
101
+ def calculate_stats(
102
+ activation_type: Literal["float8", "bfloat16", "float32"],
103
+ weight_type: Literal["float8", "bfloat16", "float32"],
104
+ systolic_array_size: int,
105
+ num_accelerator_cores: int,
106
+ fast_internals: Literal["Fast", "Efficient"],
107
+ pipeline_level: Literal["None", "Low", "Full"],
108
+ process_node_size: Literal["7nm", "45nm", "130nm"],
109
+ ):
110
+ """
111
+ Calculate hardware statistics for both lmul and standard IEEE multiplier configurations.
112
+
113
+ Args:
114
+ activation_type: Type of activations
115
+ weight_type: Type of weights
116
+ systolic_array_size: Size of the systolic array
117
+ num_accelerator_cores: Number of accelerator cores
118
+ fast_internals: Whether to use fast or efficient components
119
+ pipeline_level: Level of pipelining
120
+ process_node_size: Process node size (ignored for now)
121
+
122
+ Returns:
123
+ Tuple of (lmul_html, ieee_html) HTML strings for display
124
+ """
125
+ stat_map = {
126
+ "float8": "fp8",
127
+ "bfloat16": "bf16",
128
+ "float32": "fp32",
129
+ "Fast": True,
130
+ "Efficient": False,
131
+ "None": 0,
132
+ "Low": 1,
133
+ "None": "combinational",
134
+ "Low": "combinational",
135
+ "Full": "pipelined",
136
+ "7nm": 7,
137
+ "45nm": 45,
138
+ "130nm": 130,
139
+ }
140
+ # Calculate hardware stats using the functions from hardware_stats.py
141
+ lmul_metrics, ieee_metrics = calculate_hardware_stats(
142
+ DF,
143
+ activation_type,
144
+ weight_type,
145
+ systolic_array_size,
146
+ num_accelerator_cores,
147
+ fast_internals,
148
+ pipeline_level,
149
+ process_node_size,
150
+ )
151
+
152
+ # comparison_metrics = calculate_comparison_metrics(lmul_metrics, ieee_metrics)
153
+
154
+ # Format the metrics for display in the Gradio UI
155
+ lmul_html = "<div style='text-align: left;'>"
156
+ for key, value in lmul_metrics.items():
157
+ lmul_html += f"<p><b>{key}:</b> {value}</p>"
158
+ lmul_html += "</div>"
159
+
160
+ ieee_html = "<div style='text-align: left;'>"
161
+ for key, value in ieee_metrics.items():
162
+ ieee_html += f"<p><b>{key}:</b> {value}</p>"
163
+ ieee_html += "</div>"
164
+
165
+ # comparison_html = "<div style='text-align: left;'>"
166
+ # comparison_html += "<h3>Comparison (lmul vs IEEE)</h3>"
167
+ # for key, value in comparison_metrics.items():
168
+ # comparison_html += f"<p><b>{key}:</b> {value}</p>"
169
+ # comparison_html += "</div>"
170
+
171
+ return (
172
+ lmul_html,
173
+ ieee_html,
174
+ # comparison_html,
175
+ )
176
+
177
+
178
+ def predict_lmul(
179
+ sketchpad: EditorValue,
180
+ weight: str,
181
+ activation: str,
182
+ gr_progress=gr.Progress(track_tqdm=True),
183
+ ):
184
+ if weight == "float8" and activation == "float8":
185
+ activation = "bfloat16"
186
+ config = CompiledAcceleratorConfig(
187
+ array_size=8,
188
+ activation_type=dtype_map[activation],
189
+ weight_type=dtype_map[weight],
190
+ multiplier=lmul_simple,
191
+ )
192
+ sim = CompiledAcceleratorSimulator(config, MODEL)
193
+
194
+ x = image_to_tensor(sketchpad).detach().numpy().flatten()
195
+ probabilities = sim.predict(x)
196
+ return {cls: float(prob) for cls, prob in zip(classes, probabilities)}
197
+
198
+
199
+ def predict_ieee(
200
+ sketchpad: EditorValue,
201
+ weight: str,
202
+ activation: str,
203
+ gr_progress=gr.Progress(track_tqdm=True),
204
+ ):
205
+ if weight == "float8" and activation == "float8":
206
+ activation = "bfloat16"
207
+ config = CompiledAcceleratorConfig(
208
+ array_size=8,
209
+ activation_type=dtype_map[activation],
210
+ weight_type=dtype_map[weight],
211
+ multiplier=float_multiplier,
212
+ )
213
+ simulator = CompiledAcceleratorSimulator(config, MODEL)
214
+
215
+ x = image_to_tensor(sketchpad).detach().numpy().flatten()
216
+ probabilities = simulator.predict(x)
217
+ return {cls: float(prob) for cls, prob in zip(classes, probabilities)}
218
+
219
+
220
+ # ------------ Blocks UI Layout ------------ #
221
+
222
+
223
+ def create_app():
224
+ with gr.Blocks(fill_height=False, fill_width=False, title=__file__) as demo:
225
+
226
+ gr.Markdown("## Draw a digit to see the model's prediction")
227
+ with gr.Row(equal_height=False):
228
+ with gr.Column(scale=3):
229
+ canvas_size = (400, 400)
230
+ sketchpad = gr.Sketchpad(
231
+ # label="Draw a digit",
232
+ type="pil", # Changed to PIL
233
+ transforms=(),
234
+ layers=False,
235
+ canvas_size=canvas_size,
236
+ # scale=2,
237
+ container=False,
238
+ )
239
+ predict_btn = gr.Button(
240
+ "Run Hardware Simulation",
241
+ variant="primary",
242
+ )
243
+
244
+ # with gr.Accordion("Accelerator Configuration", open=True):
245
+ with gr.Group():
246
+ with gr.Row(): # Weight and activation types
247
+ weight_type_component = gr.Radio(
248
+ label="Weights d-type",
249
+ choices=accelerator_dtypes,
250
+ value="float8",
251
+ interactive=True,
252
+ )
253
+ activation_type_component = gr.Radio(
254
+ label="Activations d-type",
255
+ choices=accelerator_dtypes,
256
+ value="bfloat16",
257
+ interactive=True,
258
+ )
259
+ # Prevent w8a8 from being selected, or any other combination where act < weight
260
+ weight_type_component.select(
261
+ fn=filter_activation_types,
262
+ inputs=[weight_type_component, activation_type_component],
263
+ outputs=activation_type_component,
264
+ )
265
+ gr.on(
266
+ triggers=[
267
+ weight_type_component.select,
268
+ activation_type_component.select,
269
+ ],
270
+ fn=warn_w8a8,
271
+ inputs=[weight_type_component, activation_type_component],
272
+ )
273
+ with gr.Row():
274
+ systolic_array_size_component = gr.Slider(
275
+ label="Systolic Array Size",
276
+ info="Dimensions of the matrix acceleration unit",
277
+ minimum=4,
278
+ maximum=512,
279
+ step=1,
280
+ value=16,
281
+ interactive=True,
282
+ )
283
+ num_accelerator_cores_component = gr.Number(
284
+ label="Number of Accelerator Cores",
285
+ info="Total number of accelerator units per chip",
286
+ minimum=1,
287
+ maximum=1024,
288
+ step=1,
289
+ value=1,
290
+ interactive=True,
291
+ )
292
+ with gr.Row(equal_height=True):
293
+ fast_internals_component = gr.Dropdown(
294
+ label="Internal Component Type",
295
+ info="Configure the lowest level hardware units to use a faster or more efficient design.",
296
+ choices=["Fast", "Efficient"],
297
+ value="Fast",
298
+ interactive=True,
299
+ filterable=False,
300
+ )
301
+ pipeline_level_component = gr.Dropdown(
302
+ label="Pipeline Level",
303
+ info="Configure the pipeline level of processing elements within the accelerator. Low uses a single register between multipliers and adders. Full uses pipelined individual components.",
304
+ choices=["None", "Low", "Full"],
305
+ value="Full",
306
+ interactive=True,
307
+ filterable=False,
308
+ )
309
+ process_node_size_component = gr.Radio(
310
+ label="Process Node Size",
311
+ info="Configure the process node size of the hardware units. Smaller nodes are faster and use less area.",
312
+ choices=["7nm", "45nm", "130nm"],
313
+ value="45nm",
314
+ interactive=False,
315
+ )
316
+
317
+ with gr.Column(scale=2):
318
+ lmul_predictions = gr.Label(
319
+ label="l-mul Simulator Predictions",
320
+ value=labels_value,
321
+ min_width=100,
322
+ )
323
+
324
+ lmul_html = gr.HTML(
325
+ label="L-mul Hardware Stats",
326
+ show_label=True,
327
+ container=True,
328
+ )
329
+
330
+ with gr.Column(scale=2):
331
+ ieee_predictions = gr.Label(
332
+ label="IEEE Simulator Predictions",
333
+ value=labels_value,
334
+ min_width=100,
335
+ )
336
+ ieee_html = gr.HTML(
337
+ label="IEEE Multiplier Hardware Stats",
338
+ show_label=True,
339
+ container=True,
340
+ )
341
+
342
+ # ------------ Event Listeners ------------ #
343
+
344
+ predict_btn.click(
345
+ fn=predict_ieee,
346
+ inputs=[sketchpad, weight_type_component, activation_type_component],
347
+ outputs=ieee_predictions,
348
+ )
349
+
350
+ # TODO: implement simulator_predict
351
+ predict_btn.click(
352
+ fn=predict_lmul,
353
+ inputs=[sketchpad, weight_type_component, activation_type_component],
354
+ outputs=lmul_predictions,
355
+ )
356
+
357
+ gr.on(
358
+ triggers=[
359
+ demo.load,
360
+ activation_type_component.change,
361
+ weight_type_component.change,
362
+ systolic_array_size_component.change,
363
+ num_accelerator_cores_component.change,
364
+ fast_internals_component.change,
365
+ pipeline_level_component.change,
366
+ process_node_size_component.change,
367
+ ],
368
+ fn=calculate_stats,
369
+ inputs=[
370
+ activation_type_component,
371
+ weight_type_component,
372
+ systolic_array_size_component,
373
+ num_accelerator_cores_component,
374
+ fast_internals_component,
375
+ pipeline_level_component,
376
+ process_node_size_component,
377
+ ],
378
+ outputs=[lmul_html, ieee_html],
379
+ show_progress="hidden",
380
+ )
381
+
382
+ return demo
383
+
384
+
385
+ if __name__ == "__main__":
386
+ demo = create_app()
387
+ demo.queue()
388
+ demo.launch(share=False)
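filter_activation_types and warn_w8a8 are defined earlier in app.py and do not appear in this hunk. A minimal sketch of the ordering guard they enforce (the activation dtype must be at least as wide as the weight dtype), with assumed bitwidths and dtype names mirroring the radio choices above:

    import gradio as gr

    accelerator_dtypes = ["float8", "bfloat16", "float32"]  # assumed to mirror the UI choices
    bitwidths = {"float8": 8, "bfloat16": 16, "float32": 32}  # assumed widths, not taken from this diff

    def filter_activation_types(weight: str, activation: str):
        # keep only activation dtypes at least as wide as the selected weight dtype
        valid = [d for d in accelerator_dtypes if bitwidths[d] >= bitwidths[weight]]
        return gr.Radio(choices=valid, value=activation if activation in valid else valid[0])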
hardware_accelerators/compile.py ADDED
@@ -0,0 +1,167 @@
 
 
1
+ import os
2
+ import time
3
+ import multiprocessing
4
+ import pyrtl
5
+ from tqdm import tqdm
6
+ from functools import partial
7
+
8
+ from .simulation import CompiledAcceleratorSimulator
9
+
10
+ from .rtllib import float_multiplier, lmul_simple
11
+ from .rtllib.accelerator import CompiledAcceleratorConfig
12
+ from .dtypes import BaseFloat, Float32, Float16, BF16, Float8
13
+ from typing import Iterator, Type, List, Callable
14
+ from itertools import product
15
+
16
+
17
+ def generate_accelerator_configs(
18
+ array_size: int = 8,
19
+ dtypes: List[Type[BaseFloat]] | None = None,
20
+ multipliers: List[Callable] | None = None,
21
+ **kwargs,
22
+ ) -> Iterator[CompiledAcceleratorConfig]:
23
+ """
24
+ Generate all valid CompiledAcceleratorConfig combinations.
25
+
26
+ Args:
27
+ array_size: Size of the systolic array
28
+ dtypes: List of data types to consider. Defaults to [Float8, BF16, Float32]
29
+ multipliers: List of multiplier functions. Defaults to [float_multiplier, lmul_simple]
30
+
31
+ Yields:
32
+ Valid CompiledAcceleratorConfig objects
33
+
34
+ Restrictions:
35
+ 1. The activation_type must be greater than or equal to the weight_type in terms of bitwidth.
36
+ 2. 16-bit float types (BF16, FP16) should not be combined with each other.
37
+ They should only pair with themselves or with FP32.
38
+ """
39
+ if dtypes is None:
40
+ dtypes = [Float8, BF16, Float32]
41
+
42
+ if multipliers is None:
43
+ multipliers = [float_multiplier, lmul_simple]
44
+
45
+ # Sort dtypes by bitwidth for easier comparison
46
+ dtype_bitwidths = {dtype: dtype.bitwidth() for dtype in dtypes}
47
+ sorted_dtypes = sorted(dtypes, key=lambda d: dtype_bitwidths[d])
48
+
49
+ # Identify 16-bit float types
50
+ bit16_float_types = [dtype for dtype in dtypes if dtype_bitwidths[dtype] == 16]
51
+
52
+ # Generate all combinations
53
+ for multiplier in multipliers:
54
+ for weight_type in sorted_dtypes:
55
+ # Find valid activation types based on bitwidth
56
+ valid_activation_types = [
57
+ dtype
58
+ for dtype in sorted_dtypes
59
+ if dtype_bitwidths[dtype] >= dtype_bitwidths[weight_type]
60
+ ]
61
+
62
+ for activation_type in valid_activation_types:
63
+ # Skip invalid combinations of 16-bit float types
64
+ if (
65
+ weight_type in bit16_float_types
66
+ and activation_type in bit16_float_types
67
+ and weight_type != activation_type
68
+ ):
69
+ continue
70
+
71
+ yield CompiledAcceleratorConfig(
72
+ array_size=array_size,
73
+ activation_type=activation_type,
74
+ weight_type=weight_type,
75
+ multiplier=multiplier,
76
+ **kwargs,
77
+ )
78
+
79
+
80
+ def compile_and_save_simulator(config):
81
+ """Compile and save a simulator for a given configuration.
82
+
83
+ Args:
84
+ config: The CompiledAcceleratorConfig to use
85
+
86
+ Returns:
87
+ Tuple of (config, success, time_taken)
88
+ """
89
+ start_time = time.time()
90
+
91
+ try:
92
+ # Create the simulator
93
+ with pyrtl.temp_working_block():
94
+ CompiledAcceleratorSimulator(config)
95
+
96
+ end_time = time.time()
97
+ return (config, True, end_time - start_time)
98
+
99
+ except Exception as e:
100
+ end_time = time.time()
101
+ print(f"Error compiling {config}: {str(e)}")
102
+ return (config, False, end_time - start_time)
103
+
104
+
105
+ def compile_all_simulators(configs=None, max_workers=None):
106
+ """Compile and save simulators for all configurations using multiprocessing.
107
+
108
+ Args:
109
+ configs: List of configurations to compile. If None, generates all valid configs.
110
+ max_workers: Maximum number of worker processes. If None, uses CPU count.
112
+
113
+ Returns:
114
+ List of results (config, success, time_taken)
115
+ """
116
+ if configs is None:
117
+ configs = list(generate_accelerator_configs())
118
+
119
+ if max_workers is None:
120
+ max_workers = multiprocessing.cpu_count()
121
+
122
+ print(f"Compiling {len(configs)} configurations using {max_workers} workers")
123
+
124
+ # Wrap the compile function for use with the worker pool
125
+ compile_func = partial(compile_and_save_simulator)
126
+
127
+ # Use multiprocessing to compile all configurations
128
+ with multiprocessing.Pool(processes=max_workers) as pool:
129
+ # Use tqdm to show progress
130
+ results = list(
131
+ tqdm(
132
+ pool.imap(compile_func, configs),
133
+ total=len(configs),
134
+ desc="Compiling simulators",
135
+ )
136
+ )
137
+
138
+ # Print summary
139
+ successful = [r for r in results if r[1]]
140
+ failed = [r for r in results if not r[1]]
141
+
142
+ print(f"\nCompilation complete:")
143
+ print(f" Total: {len(results)}")
144
+ print(f" Successful: {len(successful)}")
145
+ print(f" Failed: {len(failed)}")
146
+
147
+ if successful:
148
+ avg_time = sum(r[2] for r in successful) / len(successful)
149
+ print(f" Average compilation time: {avg_time:.2f} seconds")
150
+
151
+ return results
152
+
153
+
154
+ if __name__ == "__main__":
155
+ # Generate all valid configurations
156
+ all_configs = list(generate_accelerator_configs())
157
+ print(f"Generated {len(all_configs)} configs")
158
+
159
+ # Compile and save simulators for all configurations
160
+ results = compile_all_simulators(all_configs)
161
+
162
+ # Print details of failed compilations if any
163
+ failed = [r for r in results if not r[1]]
164
+ if failed:
165
+ print("\nFailed compilations:")
166
+ for config, _, _ in failed:
167
+ print(config.name)
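With the defaults above (dtypes [Float8, BF16, Float32] and multipliers [float_multiplier, lmul_simple]), the generator yields 12 configurations: 6 valid (weight, activation) pairs times 2 multipliers. A quick sanity check, assuming the package is importable from the repository root:

    from hardware_accelerators.compile import generate_accelerator_configs

    configs = list(generate_accelerator_configs(array_size=8))
    # (F8,F8), (F8,BF16), (F8,F32), (BF16,BF16), (BF16,F32), (F32,F32), each with 2 multipliers
    assert len(configs) == 12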
hardware_accelerators/dtypes/__init__.py CHANGED
@@ -2,5 +2,7 @@
2
  from .base import BaseFloat
3
  from .bfloat16 import BF16
4
  from .float8 import Float8
 
 
5
 
6
- __all__ = ["BaseFloat", "Float8", "BF16"]
 
2
  from .base import BaseFloat
3
  from .bfloat16 import BF16
4
  from .float8 import Float8
5
+ from .float16 import Float16
6
+ from .float32 import Float32
7
 
8
+ __all__ = ["BaseFloat", "Float8", "BF16", "Float16", "Float32"]
hardware_accelerators/dtypes/base.py CHANGED
@@ -56,6 +56,11 @@ class BaseFloat(ABC):
56
  else:
57
  raise ValueError("Must provide one of: value, binary, or binint")
58
 
 
 
 
 
 
59
  @classmethod
60
  @abstractmethod
61
  def format_spec(cls) -> FormatSpec:
@@ -160,7 +165,6 @@ class BaseFloat(ABC):
160
 
161
  def _format_binary_string(self, binary=None) -> str:
162
  """Format binary string with dots for readability"""
163
- # Clean the input string first
164
  if binary is None:
165
  binary = self.binary
166
  clean_binary = "".join(c for c in binary if c in "01")
@@ -169,8 +173,13 @@ class BaseFloat(ABC):
169
 
170
  if self.bitwidth() == 8: # Float8
171
  return f"{clean_binary[0]}.{clean_binary[1:5]}.{clean_binary[5:]}"
172
- elif self.bitwidth() == 16: # BF16
173
- return clean_binary # BF16 doesn't use dot formatting
 
 
 
 
 
174
  else:
175
  return clean_binary
176
 
 
56
  else:
57
  raise ValueError("Must provide one of: value, binary, or binint")
58
 
59
+ @classmethod
60
+ @abstractmethod
61
+ def binary_max(cls) -> int:
62
+ pass
63
+
64
  @classmethod
65
  @abstractmethod
66
  def format_spec(cls) -> FormatSpec:
 
165
 
166
  def _format_binary_string(self, binary=None) -> str:
167
  """Format binary string with dots for readability"""
 
168
  if binary is None:
169
  binary = self.binary
170
  clean_binary = "".join(c for c in binary if c in "01")
 
173
 
174
  if self.bitwidth() == 8: # Float8
175
  return f"{clean_binary[0]}.{clean_binary[1:5]}.{clean_binary[5:]}"
176
+ elif self.bitwidth() == 32: # Float32
177
+ return f"{clean_binary[0]}.{clean_binary[1:9]}.{clean_binary[9:]}"
178
+ elif self.bitwidth() == 16:
179
+ if self.__class__.__name__ == "Float16": # Float16
180
+ return f"{clean_binary[0]}.{clean_binary[1:6]}.{clean_binary[6:]}"
181
+ else: # BF16
182
+ return clean_binary
183
  else:
184
  return clean_binary
185
 
hardware_accelerators/dtypes/bfloat16.py CHANGED
@@ -29,6 +29,10 @@ class BF16(BaseFloat):
29
  min_subnormal=2**-126 * (1 / 128),
30
  )
31
 
 
 
 
 
32
  def _float32_to_bf16_parts(self, f32: float) -> Tuple[int, int, int]:
33
  """Convert float32 to BF16 parts (sign, exponent, mantissa)"""
34
  # Get binary representation of float32
 
29
  min_subnormal=2**-126 * (1 / 128),
30
  )
31
 
32
+ @classmethod
33
+ def binary_max(cls) -> int:
34
+ return 0b0111111101111111
35
+
36
  def _float32_to_bf16_parts(self, f32: float) -> Tuple[int, int, int]:
37
  """Convert float32 to BF16 parts (sign, exponent, mantissa)"""
38
  # Get binary representation of float32
hardware_accelerators/dtypes/float16.py ADDED
@@ -0,0 +1,167 @@
 
 
1
+ from .base import BaseFloat, FormatSpec
2
+
3
+
4
+ class Float16(BaseFloat):
5
+ """
6
+ 16-bit floating point number with IEEE 754 half-precision format
7
+ - 1 sign bit
8
+ - 5 exponent bits (bias 15)
9
+ - 10 mantissa bits
10
+ """
11
+
12
+ @classmethod
13
+ def format_spec(cls) -> FormatSpec:
14
+ return FormatSpec(
15
+ total_bits=16,
16
+ exponent_bits=5,
17
+ mantissa_bits=10,
18
+ bias=15,
19
+ max_normal=65504.0, # from 0.11110.1111111111
20
+ min_normal=2**-14, # from 0.00001.0000000000
21
+ max_subnormal=2**-14 * (1023 / 1024), # from 0.00000.1111111111
22
+ min_subnormal=2**-24, # from 0.00000.0000000001
23
+ )
24
+
25
+ @classmethod
26
+ def binary_max(cls) -> int:
27
+ return 0b0111101111111111
28
+
29
+ def _decimal_to_binary(self, num: float) -> str:
30
+ """Convert decimal number to binary string in IEEE 754 format"""
31
+ if num == 0:
32
+ return "0.00000.0000000000"
33
+
34
+ # Extract sign bit
35
+ sign = "1" if num < 0 else "0"
36
+ num = abs(num)
37
+
38
+ # Handle NaN
39
+ if num != num: # Python's way to check for NaN
40
+ return sign + ".11111.1111111111"
41
+
42
+ # Clamp to max value if overflow
43
+ if num > self.max_normal():
44
+ return "0.11110.1111111111" if sign == "0" else "1.11110.1111111111"
45
+
46
+ # Find exponent and normalized mantissa
47
+ exp = 0
48
+ temp = num
49
+
50
+ # Handle normal numbers
51
+ while temp >= 2 and exp < 31:
52
+ temp /= 2
53
+ exp += 1
54
+ while temp < 1 and exp > -14:
55
+ temp *= 2
56
+ exp -= 1
57
+
58
+ # Handle subnormal numbers
59
+ if exp <= -14:
60
+ # Shift mantissa right and adjust
61
+ shift = -14 - exp
62
+ temp /= 2**shift
63
+ exp = -14
64
+
65
+ # Calculate biased exponent
66
+ if temp < 1: # Subnormal
67
+ biased_exp = "00000"
68
+ else: # Normal
69
+ biased_exp = format(exp + self.bias(), "05b")
70
+
71
+ # Calculate mantissa bits
72
+ if temp < 1: # Subnormal
73
+ mantissa_value = int(temp * (2 ** self.mantissa_bits()))
74
+ else: # Normal
75
+ mantissa_value = int((temp - 1) * (2 ** self.mantissa_bits()))
76
+
77
+ mantissa = format(mantissa_value, f"0{self.mantissa_bits()}b")
78
+
79
+ return f"{sign}.{biased_exp}.{mantissa}"
80
+
81
+ def _binary_to_decimal(self, binary: str) -> float:
82
+ """Convert binary string in IEEE 754 format to decimal"""
83
+ # Clean up binary string
84
+ binary = "".join(c for c in binary if c in "01")
85
+
86
+ # Extract components
87
+ sign = -1 if binary[0] == "1" else 1
88
+ exp = binary[1:6]
89
+ mantissa = binary[6:]
90
+
91
+ # Handle special cases
92
+ if exp == "11111" and mantissa == "1111111111": # NaN representation
93
+ return float("nan")
94
+ if exp == "00000" and mantissa == "0000000000":
95
+ return 0.0
96
+
97
+ # Convert biased exponent
98
+ biased_exp = int(exp, 2)
99
+
100
+ if biased_exp == 0: # Subnormal number
101
+ actual_exp = -14
102
+ mantissa_value = int(mantissa, 2) / (2 ** self.mantissa_bits())
103
+ return sign * (2**actual_exp) * mantissa_value
104
+ else: # Normal number
105
+ actual_exp = biased_exp - self.bias()
106
+ mantissa_value = 1 + int(mantissa, 2) / (2 ** self.mantissa_bits())
107
+ return sign * (2**actual_exp) * mantissa_value
108
+
109
+ @classmethod
110
+ def from_bits(cls, bits: int) -> "Float16":
111
+ """Create Float16 from 16-bit integer"""
112
+ return cls(binint=bits)
113
+
114
+ @classmethod
115
+ def nan(cls) -> "Float16":
116
+ """Create NaN value"""
117
+ return cls(binary="1.11111.1111111111")
118
+
119
+ @classmethod
120
+ def max_value(cls) -> "Float16":
121
+ """Create maximum representable value"""
122
+ return cls(binary="0.11110.1111111111")
123
+
124
+ @classmethod
125
+ def min_value(cls) -> "Float16":
126
+ """Create minimum representable normal value"""
127
+ return cls(binary="0.00001.0000000000")
128
+
129
+ @classmethod
130
+ def min_subnormal(cls) -> "Float16":
131
+ """Create minimum representable subnormal value"""
132
+ return cls(binary="0.00000.0000000001")
133
+
134
+ def detailed_breakdown(self) -> dict:
135
+ """Provide detailed breakdown of the Float16 number components"""
136
+ binary = "".join(c for c in self.binary if c in "01")
137
+ sign_bit = int(binary[0])
138
+ exp_bits = binary[1:6]
139
+ mantissa_bits = binary[6:]
140
+
141
+ exp_val = int(exp_bits, 2)
142
+ mantissa_val = int(mantissa_bits, 2)
143
+
144
+ is_normal = exp_val != 0 and exp_val != 31
145
+ is_subnormal = exp_val == 0 and mantissa_val != 0
146
+ is_zero = exp_val == 0 and mantissa_val == 0
147
+ is_nan = (
148
+ exp_val == 31 and mantissa_val == 1023
149
+ ) # Only s.11111.1111111111 is NaN
150
+
151
+ return {
152
+ "binary": self.binary,
153
+ "sign": sign_bit,
154
+ "exponent_bits": exp_bits,
155
+ "exponent_value": (exp_val - self.bias() if exp_val != 0 else "subnormal"),
156
+ "mantissa_bits": mantissa_bits,
157
+ "mantissa_value": mantissa_val,
158
+ "decimal_approx": self.decimal_approx,
159
+ "original_value": self.original_value,
160
+ "is_normal": is_normal,
161
+ "is_subnormal": is_subnormal,
162
+ "is_zero": is_zero,
163
+ "is_nan": is_nan,
164
+ "normalized_value": (
165
+ (1 + mantissa_val / 1024) if is_normal else (mantissa_val / 1024)
166
+ ),
167
+ }
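A worked example of the encoding above: 1.5 = 1.1 (binary) x 2^0, so sign = 0, biased exponent = 0 + 15 = 01111, and mantissa = 1000000000; binary_max() (0b0111101111111111, i.e. 0.11110.1111111111) decodes to 65504.0. A sketch, assuming the BaseFloat constructor accepts a plain value and exposes the dotted .binary string as the other dtypes do:

    from hardware_accelerators.dtypes import Float16

    x = Float16(1.5)
    print(x.binary)  # expected: 0.01111.1000000000
    print(Float16.max_value().decimal_approx)  # expected: 65504.0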
hardware_accelerators/dtypes/float32.py ADDED
@@ -0,0 +1,174 @@
 
 
1
+ from .base import BaseFloat, FormatSpec
2
+
3
+
4
+ class Float32(BaseFloat):
5
+ """
6
+ 32-bit floating point number with IEEE 754 single-precision format
7
+ - 1 sign bit
8
+ - 8 exponent bits (bias 127)
9
+ - 23 mantissa bits
10
+ """
11
+
12
+ @classmethod
13
+ def format_spec(cls) -> FormatSpec:
14
+ return FormatSpec(
15
+ total_bits=32,
16
+ exponent_bits=8,
17
+ mantissa_bits=23,
18
+ bias=127,
19
+ max_normal=3.4028235e38, # from 0.11111110.11111111111111111111111
20
+ min_normal=2**-126, # from 0.00000001.00000000000000000000000
21
+ max_subnormal=2**-126
22
+ * (8388607 / 8388608), # from 0.00000000.11111111111111111111111
23
+ min_subnormal=2**-149, # from 0.00000000.00000000000000000000001
24
+ )
25
+
26
+ @classmethod
27
+ def binary_max(cls) -> int:
28
+ return 0b01111111011111111111111111111111
29
+
30
+ def _decimal_to_binary(self, num: float) -> str:
31
+ """Convert decimal number to binary string in IEEE 754 format"""
32
+ if num == 0:
33
+ return "0.00000000.00000000000000000000000"
34
+
35
+ # Extract sign bit
36
+ sign = "1" if num < 0 else "0"
37
+ num = abs(num)
38
+
39
+ # Handle NaN
40
+ if num != num: # Python's way to check for NaN
41
+ return sign + ".11111111.11111111111111111111111"
42
+
43
+ # Clamp to max value if overflow
44
+ if num > self.max_normal():
45
+ return (
46
+ "0.11111110.11111111111111111111111"
47
+ if sign == "0"
48
+ else "1.11111110.11111111111111111111111"
49
+ )
50
+
51
+ # Find exponent and normalized mantissa
52
+ exp = 0
53
+ temp = num
54
+
55
+ # Handle normal numbers
56
+ while temp >= 2 and exp < 255:
57
+ temp /= 2
58
+ exp += 1
59
+ while temp < 1 and exp > -126:
60
+ temp *= 2
61
+ exp -= 1
62
+
63
+ # Handle subnormal numbers
64
+ if exp <= -126:
65
+ # Shift mantissa right and adjust
66
+ shift = -126 - exp
67
+ temp /= 2**shift
68
+ exp = -126
69
+
70
+ # Calculate biased exponent
71
+ if temp < 1: # Subnormal
72
+ biased_exp = "00000000"
73
+ else: # Normal
74
+ biased_exp = format(exp + self.bias(), "08b")
75
+
76
+ # Calculate mantissa bits
77
+ if temp < 1: # Subnormal
78
+ mantissa_value = int(temp * (2 ** self.mantissa_bits()))
79
+ else: # Normal
80
+ mantissa_value = int((temp - 1) * (2 ** self.mantissa_bits()))
81
+
82
+ mantissa = format(mantissa_value, f"0{self.mantissa_bits()}b")
83
+
84
+ return f"{sign}.{biased_exp}.{mantissa}"
85
+
86
+ def _binary_to_decimal(self, binary: str) -> float:
87
+ """Convert binary string in IEEE 754 format to decimal"""
88
+ # Clean up binary string
89
+ binary = "".join(c for c in binary if c in "01")
90
+
91
+ # Extract components
92
+ sign = -1 if binary[0] == "1" else 1
93
+ exp = binary[1:9]
94
+ mantissa = binary[9:]
95
+
96
+ # Handle special cases
97
+ if (
98
+ exp == "11111111" and mantissa == "11111111111111111111111"
99
+ ): # NaN representation
100
+ return float("nan")
101
+ if exp == "00000000" and mantissa == "00000000000000000000000":
102
+ return 0.0
103
+
104
+ # Convert biased exponent
105
+ biased_exp = int(exp, 2)
106
+
107
+ if biased_exp == 0: # Subnormal number
108
+ actual_exp = -126
109
+ mantissa_value = int(mantissa, 2) / (2 ** self.mantissa_bits())
110
+ return sign * (2**actual_exp) * mantissa_value
111
+ else: # Normal number
112
+ actual_exp = biased_exp - self.bias()
113
+ mantissa_value = 1 + int(mantissa, 2) / (2 ** self.mantissa_bits())
114
+ return sign * (2**actual_exp) * mantissa_value
115
+
116
+ @classmethod
117
+ def from_bits(cls, bits: int) -> "Float32":
118
+ """Create Float32 from 32-bit integer"""
119
+ return cls(binint=bits)
120
+
121
+ @classmethod
122
+ def nan(cls) -> "Float32":
123
+ """Create NaN value"""
124
+ return cls(binary="1.11111111.11111111111111111111111")
125
+
126
+ @classmethod
127
+ def max_value(cls) -> "Float32":
128
+ """Create maximum representable value"""
129
+ return cls(binary="0.11111110.11111111111111111111111")
130
+
131
+ @classmethod
132
+ def min_value(cls) -> "Float32":
133
+ """Create minimum representable normal value"""
134
+ return cls(binary="0.00000001.00000000000000000000000")
135
+
136
+ @classmethod
137
+ def min_subnormal(cls) -> "Float32":
138
+ """Create minimum representable subnormal value"""
139
+ return cls(binary="0.00000000.00000000000000000000001")
140
+
141
+ def detailed_breakdown(self) -> dict:
142
+ """Provide detailed breakdown of the Float32 number components"""
143
+ binary = "".join(c for c in self.binary if c in "01")
144
+ sign_bit = int(binary[0])
145
+ exp_bits = binary[1:9]
146
+ mantissa_bits = binary[9:]
147
+
148
+ exp_val = int(exp_bits, 2)
149
+ mantissa_val = int(mantissa_bits, 2)
150
+
151
+ is_normal = exp_val != 0 and exp_val != 255
152
+ is_subnormal = exp_val == 0 and mantissa_val != 0
153
+ is_zero = exp_val == 0 and mantissa_val == 0
154
+ is_nan = (
155
+ exp_val == 255 and mantissa_val == 8388607
156
+ ) # Only s.11111111.11111111111111111111111 is NaN
157
+
158
+ return {
159
+ "binary": self.binary,
160
+ "sign": sign_bit,
161
+ "exponent_bits": exp_bits,
162
+ "exponent_value": (exp_val - self.bias() if exp_val != 0 else "subnormal"),
163
+ "mantissa_bits": mantissa_bits,
164
+ "mantissa_value": mantissa_val,
165
+ "decimal_approx": self.decimal_approx,
166
+ "original_value": self.original_value,
167
+ "is_normal": is_normal,
168
+ "is_subnormal": is_subnormal,
169
+ "is_zero": is_zero,
170
+ "is_nan": is_nan,
171
+ "normalized_value": (
172
+ (1 + mantissa_val / 8388608) if is_normal else (mantissa_val / 8388608)
173
+ ),
174
+ }
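For reference, binary_max() above is 0x7F7FFFFF, the largest finite IEEE 754 single (sign 0, exponent 11111110, mantissa all ones), matching max_normal = 3.4028235e38. A quick cross-check against the standard library:

    import struct

    bits = 0b01111111011111111111111111111111  # 0x7F7FFFFF
    value = struct.unpack(">f", bits.to_bytes(4, "big"))[0]
    print(value)  # 3.4028234663852886e+38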
hardware_accelerators/dtypes/float8.py CHANGED
@@ -23,6 +23,10 @@ class Float8(BaseFloat):
23
  min_subnormal=2**-6 * (1 / 8), # from 0.0000.001
24
  )
25
 
 
 
 
 
26
  def _decimal_to_binary(self, num: float) -> str:
27
  """Convert decimal number to binary string in E4M3 format"""
28
  if num == 0:
 
23
  min_subnormal=2**-6 * (1 / 8), # from 0.0000.001
24
  )
25
 
26
+ @classmethod
27
+ def binary_max(cls) -> int:
28
+ return 0b01111110
29
+
30
  def _decimal_to_binary(self, num: float) -> str:
31
  """Convert decimal number to binary string in E4M3 format"""
32
  if num == 0:
hardware_accelerators/nn/lmul.py ADDED
@@ -0,0 +1,135 @@
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torchvision
5
+ from torch.utils.data import DataLoader
6
+ from torchvision import datasets, transforms
7
+ import math
8
+ import time
9
+
10
+
11
+ # Custom approximate matrix multiplication using lmul
12
+ def lmul_matmul(A: torch.Tensor, B: torch.Tensor, dtype=torch.float32):
13
+ """
14
+ Approximate matrix multiplication between A (m x n) and B (n x p)
15
+ by adding the operands' integer bit patterns (the l-mul approximation) instead of multiplying.
16
+ """
17
+ if dtype == torch.float32:
18
+ # reinterpret bits as uint32 then convert to int64 for arithmetic
19
+ A_int = A.contiguous().view(torch.uint32).to(torch.int64)
20
+ B_int = B.contiguous().view(torch.uint32).to(torch.int64)
21
+ offset = 1064828928 # l-mul offset for float32: 127 * 2**23 - 2**(23 - 4)
22
+ elif dtype == torch.bfloat16:
23
+ A_int = A.contiguous().view(torch.uint16).to(torch.int64)
24
+ B_int = B.contiguous().view(torch.uint16).to(torch.int64)
25
+ offset = 16248 # l-mul offset for bfloat16: 127 * 2**7 - 2**(7 - 4)
26
+ else:
27
+ raise ValueError("Unsupported dtype")
28
+
29
+ # A is (m, n) and B is (n, p).
30
+ # Expand dims so that:
31
+ # A_int becomes (m, n, 1) and B_int becomes (1, n, p)
32
+ prod_int = A_int.unsqueeze(2) + B_int.unsqueeze(0) - offset # shape: (m, n, p)
33
+
34
+ # Convert the integer result back to floating point.
35
+ if dtype == torch.float32:
36
+ prod = prod_int.to(torch.uint32).view(torch.float32)
37
+ else:
38
+ prod = prod_int.to(torch.uint16).view(torch.bfloat16)
39
+
40
+ # Sum over the inner dimension to complete the dot product.
41
+ return prod.sum(dim=1)
42
+
43
+
44
+ # Custom linear layer that uses lmul-based matrix multiplication
45
+ class LmulLinear(nn.Module):
46
+ def __init__(self, in_features, out_features, bias=True, dtype=torch.float32):
47
+ super(LmulLinear, self).__init__()
48
+ self.in_features = in_features
49
+ self.out_features = out_features
50
+ self.dtype = dtype
51
+ self.weight = nn.Parameter(torch.Tensor(out_features, in_features))
52
+ if bias:
53
+ self.bias = nn.Parameter(torch.Tensor(out_features))
54
+ else:
55
+ self.register_parameter("bias", None)
56
+ self.reset_parameters()
57
+
58
+ def reset_parameters(self):
59
+ # Initialize weights similarly to nn.Linear.
60
+ nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
61
+ if self.bias is not None:
62
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
63
+ bound = 1 / math.sqrt(fan_in)
64
+ nn.init.uniform_(self.bias, -bound, bound)
65
+
66
+ def forward(self, input):
67
+ # Compute the approximate matrix multiply:
68
+ # Note: input shape is (batch, in_features)
69
+ # weight.T shape is (in_features, out_features)
70
+ out = lmul_matmul(input, self.weight.t(), self.dtype)
71
+ if self.bias is not None:
72
+ out = out + self.bias # add bias as usual
73
+ return out
74
+
75
+
76
+ # MLP model using our custom lmul-based linear layers
77
+ class LmulMLP(nn.Module):
78
+ def __init__(self, input_size, hidden_size, num_classes, dtype=torch.float32):
79
+ super(LmulMLP, self).__init__()
80
+ self.flatten = nn.Flatten()
81
+ self.fc1 = LmulLinear(input_size, hidden_size, bias=True, dtype=dtype)
82
+ self.relu = nn.ReLU()
83
+ self.fc2 = LmulLinear(hidden_size, num_classes, bias=True, dtype=dtype)
84
+
85
+ def forward(self, x):
86
+ x = self.flatten(x)
87
+ x = self.fc1(x)
88
+ x = self.relu(x)
89
+ x = self.fc2(x)
90
+ return x
91
+
92
+
93
+ # Setup: use float32 for this example.
94
+ dtype = torch.float32
95
+
96
+ # Instantiate the model.
97
+ # For MNIST: input size is 28x28 = 784, hidden layer of 128, output 10 classes.
98
+ model = LmulMLP(input_size=784, hidden_size=128, num_classes=10, dtype=dtype)
99
+ model.eval() # set model to evaluation mode
100
+
101
+ model.load_state_dict(torch.load("models/mlp_mnist_fp32.pth", weights_only=True))
102
+
103
+ # Prepare the MNIST test dataset.
104
+ transform = transforms.Compose(
105
+ [
106
+ transforms.ToTensor(),
107
+ transforms.Normalize((0.1307,), (0.3081,)),
108
+ ]
109
+ )
110
+ test_dataset = datasets.MNIST(
111
+ root="./data", train=False, transform=transform, download=True
112
+ )
113
+ test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
114
+
115
+ # Run inference on the test dataset and measure accuracy.
116
+ correct = 0
117
+ total = 0
118
+ start_time = time.time()
119
+
120
+ with torch.no_grad():
121
+ for images, labels in test_loader:
122
+ # Ensure images are in the right dtype
123
+ images = images.to(dtype)
124
+ outputs = model(images)
125
+ # Compute predictions
126
+ _, predicted = torch.max(outputs, 1)
127
+ total += labels.size(0)
128
+ correct += (predicted.cpu() == labels).sum().item()
129
+
130
+ end_time = time.time()
131
+ accuracy = correct / total * 100
132
+ inference_time = end_time - start_time
133
+
134
+ print(f"Test Accuracy: {accuracy:.2f}%")
135
+ print(f"Inference Time on Test Set: {inference_time:.2f} seconds")
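The magic offsets in lmul_matmul are consistent with the l-mul bias correction: adding the operands' integer encodings sums their biased exponents, so one bias term must be subtracted back out, along with a small correction term (2**(m - 4) for the mantissa widths used here). A quick check of the constants:

    # float32: bias 127, 23-bit mantissa
    assert 127 * 2**23 - 2**(23 - 4) == 1064828928
    # bfloat16: bias 127, 7-bit mantissa
    assert 127 * 2**7 - 2**(7 - 4) == 16248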
hardware_accelerators/nn/precision.py ADDED
@@ -0,0 +1,264 @@
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.optim as optim
5
+ from torchvision import datasets, transforms
6
+ from torch.utils.data import DataLoader
7
+ from tqdm.auto import tqdm
8
+
9
+ from .util import get_pytorch_device
10
+
11
+
12
+ # Define the MLP model (unchanged)
13
+ class MLP(nn.Module):
14
+ def __init__(self, input_size, hidden_size, num_classes):
15
+ super(MLP, self).__init__()
16
+ self.flatten = nn.Flatten()
17
+ self.fc1 = nn.Linear(input_size, hidden_size)
18
+ self.relu = nn.ReLU()
19
+ self.fc2 = nn.Linear(hidden_size, num_classes)
20
+
21
+ def forward(self, x):
22
+ x = self.flatten(x)
23
+ x = self.fc1(x)
24
+ x = self.relu(x)
25
+ out = self.fc2(x)
26
+ return out
27
+
28
+
29
+ # Helper function: adjust data to match the target dtype
30
+ def convert_input(data, precision):
31
+ if precision == "fp16":
32
+ return data.half()
33
+ elif precision == "bf16":
34
+ return data.to(torch.bfloat16)
35
+ elif precision == "fp8":
36
+ # Note: torch.float8_e4m3fn is experimental and may not be available
37
+ return data.to(torch.float8_e4m3fn)
38
+ return data # fp32 (no conversion)
39
+
40
+
41
+ # Training for one epoch
42
+ def train_epoch(model, device, train_loader, optimizer, criterion, precision):
43
+ model.train()
44
+ running_loss = 0.0
45
+ progress_bar = tqdm(train_loader, desc="Training", leave=False)
46
+ for data, target in progress_bar:
47
+ # Convert inputs to the desired precision (targets remain integer)
48
+ data = convert_input(data, precision)
49
+ data, target = data.to(device), target.to(device)
50
+ optimizer.zero_grad()
51
+
52
+ # Forward pass
53
+ outputs = model(data)
54
+ loss = criterion(outputs, target)
55
+
56
+ # Check for NaN and skip problematic batches
57
+ if torch.isnan(loss):
58
+ print("NaN loss detected in batch, skipping...")
59
+ continue
60
+
61
+ # Backward and optimize with gradient clipping
62
+ loss.backward()
63
+
64
+ # Apply gradient clipping to prevent exploding gradients
65
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
66
+
67
+ optimizer.step()
68
+ running_loss += loss.item()
69
+
70
+ if len(train_loader) > 0:
71
+ return running_loss / len(train_loader)
72
+ return 0.0
73
+
74
+
75
+ # Evaluation loop
76
+ def evaluate(model, device, test_loader, criterion, precision):
77
+ model.eval()
78
+ total_loss = 0.0
79
+ correct = 0
80
+ total = 0
81
+ with torch.no_grad():
82
+ for data, target in test_loader:
83
+ data = convert_input(data, precision)
84
+ data, target = data.to(device), target.to(device)
85
+ outputs = model(data)
86
+ loss = criterion(outputs, target)
87
+ total_loss += loss.item()
88
+ _, predicted = torch.max(outputs, 1)
89
+ total += target.size(0)
90
+ correct += (predicted == target).sum().item()
91
+ avg_loss = total_loss / len(test_loader)
92
+ accuracy = 100.0 * correct / total
93
+ return avg_loss, accuracy
94
+
95
+
96
+ # Main training function for a given precision variant
97
+ def train_model(
98
+ precision,
99
+ batch_size=32,
100
+ hidden_size=128,
101
+ num_epochs=5,
102
+ learning_rate=0.001,
103
+ optimizer_name="adam",
104
+ weight_decay=0,
105
+ eps=1e-4,
106
+ model_save_path=None,
107
+ ):
108
+ print(f"\nTraining in {precision.upper()} mode:")
109
+ device = get_pytorch_device()
110
+
111
+ # Data transformation: images are loaded as FP32 by default
112
+ transform = transforms.Compose(
113
+ [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
114
+ )
115
+
116
+ train_dataset = datasets.MNIST(
117
+ root="./data", train=True, download=True, transform=transform
118
+ )
119
+ test_dataset = datasets.MNIST(
120
+ root="./data", train=False, download=True, transform=transform
121
+ )
122
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
123
+ test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
124
+
125
+ # Hyperparameters
126
+ input_size = 28 * 28 # MNIST images are 28x28
127
+ num_classes = 10
128
+
129
+ # Create the model and send to device
130
+ model = MLP(input_size, hidden_size, num_classes).to(device)
131
+
132
+ # Convert the model to the target precision (natively)
133
+ if precision == "fp16":
134
+ model = model.to(torch.float16)
135
+ # Use a smaller learning rate for half precision if not explicitly specified
136
+ if learning_rate == 0.001: # If using the default value
137
+ learning_rate = 1e-4 # Lower learning rate for stability
138
+ elif precision == "bf16":
139
+ model = model.to(torch.bfloat16)
140
+ elif precision == "fp8":
141
+ # Ensure your PyTorch build/hardware supports float8_e4m3fn; otherwise, this will error.
142
+ model = model.to(torch.float8_e4m3fn)
143
+ # else, fp32 is already the default
144
+
145
+ # Select optimizer based on user input
146
+ if optimizer_name.lower() == "adam":
147
+ optimizer = optim.Adam(
148
+ model.parameters(), lr=learning_rate, eps=eps, weight_decay=weight_decay
149
+ )
150
+ elif optimizer_name.lower() == "sgd":
151
+ optimizer = optim.SGD(
152
+ model.parameters(),
153
+ lr=learning_rate,
154
+ momentum=0.9,
155
+ weight_decay=weight_decay,
156
+ )
157
+ elif optimizer_name.lower() == "adamw":
158
+ optimizer = optim.AdamW(
159
+ model.parameters(), lr=learning_rate, eps=eps, weight_decay=weight_decay
160
+ )
161
+ else:
162
+ print(f"Unknown optimizer: {optimizer_name}, defaulting to Adam")
163
+ optimizer = optim.Adam(
164
+ model.parameters(), lr=learning_rate, eps=eps, weight_decay=weight_decay
165
+ )
166
+
167
+ criterion = nn.CrossEntropyLoss()
168
+
169
+ print(
170
+ f"Training with: batch_size={batch_size}, hidden_size={hidden_size}, "
171
+ f"epochs={num_epochs}, lr={learning_rate}, optimizer={optimizer_name}"
172
+ )
173
+
174
+ # Training loop
175
+ for epoch in range(1, num_epochs + 1):
176
+ train_loss = train_epoch(
177
+ model, device, train_loader, optimizer, criterion, precision
178
+ )
179
+
180
+ # Check for NaN loss
181
+ if torch.isnan(torch.tensor([train_loss])):
182
+ print(f"NaN detected in epoch {epoch}, reducing learning rate")
183
+ for param_group in optimizer.param_groups:
184
+ param_group["lr"] *= 0.5
185
+
186
+ print(f"Epoch {epoch} Train Loss: {train_loss:.4f}")
187
+
188
+ # Evaluation on test set
189
+ test_loss, test_accuracy = evaluate(
190
+ model, device, test_loader, criterion, precision
191
+ )
192
+ print(
193
+ f"{precision.upper()} Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%"
194
+ )
195
+
196
+ # Optionally, save the model
197
+ if model_save_path:
198
+ save_path = model_save_path
199
+ else:
200
+ model_dir = "models"
201
+ os.makedirs(model_dir, exist_ok=True)
202
+ save_path = os.path.join(model_dir, f"mlp_mnist_{precision}.pth")
203
+
204
+ torch.save(model.state_dict(), save_path)
205
+ print(f"Model saved to {save_path}\n")
206
+
207
+
208
+ # Main script to train a model in a specific precision
209
+ if __name__ == "__main__":
210
+ import argparse
211
+
212
+ parser = argparse.ArgumentParser(
213
+ description="Train MNIST model in a specific precision"
214
+ )
215
+ parser.add_argument(
216
+ "--dtype",
217
+ type=str,
218
+ default="fp32",
219
+ choices=["fp32", "fp16", "bf16", "fp8"],
220
+ help="Precision type to train in (fp32, fp16, bf16, fp8)",
221
+ )
222
+ parser.add_argument(
223
+ "--batch-size", type=int, default=32, help="Batch size for training"
224
+ )
225
+ parser.add_argument(
226
+ "--hidden-size", type=int, default=128, help="Hidden layer size for MLP"
227
+ )
228
+ parser.add_argument(
229
+ "--epochs", type=int, default=5, help="Number of training epochs"
230
+ )
231
+ parser.add_argument("--lr", type=float, default=0.001, help="Learning rate")
232
+ parser.add_argument(
233
+ "--optimizer",
234
+ type=str,
235
+ default="adam",
236
+ choices=["adam", "sgd", "adamw"],
237
+ help="Optimizer to use for training",
238
+ )
239
+ parser.add_argument(
240
+ "--weight-decay", type=float, default=0, help="Weight decay (L2 penalty)"
241
+ )
242
+ parser.add_argument(
243
+ "--eps", type=float, default=1e-4, help="Epsilon for Adam optimizer"
244
+ )
245
+ parser.add_argument(
246
+ "--save-path", type=str, default=None, help="Path to save the trained model"
247
+ )
248
+
249
+ args = parser.parse_args()
250
+
251
+ try:
252
+ train_model(
253
+ precision=args.dtype,
254
+ batch_size=args.batch_size,
255
+ hidden_size=args.hidden_size,
256
+ num_epochs=args.epochs,
257
+ learning_rate=args.lr,
258
+ optimizer_name=args.optimizer,
259
+ weight_decay=args.weight_decay,
260
+ eps=args.eps,
261
+ model_save_path=args.save_path,
262
+ )
263
+ except Exception as e:
264
+ print(f"Error training {args.dtype.upper()} model: {e}")
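train_model() can be driven from the argparse entry point above or called directly; a sketch of the latter, assuming it runs from the repository root so the relative ./data and models/ paths resolve:

    from hardware_accelerators.nn.precision import train_model

    # trains the MLP on MNIST in bfloat16 and saves models/mlp_mnist_bf16.pth
    train_model(precision="bf16", batch_size=64, num_epochs=5, optimizer_name="adamw")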
hardware_accelerators/nn/precision_eval.py ADDED
@@ -0,0 +1,280 @@
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torchvision import datasets, transforms
4
+ from torch.utils.data import DataLoader
5
+ import numpy as np
6
+ import time
7
+ from pathlib import Path
8
+
9
+ from .precision import MLP
10
+ from .util import get_pytorch_device
11
+
12
+
13
+ def load_mnist_data(batch_size=100):
14
+ """Load MNIST test dataset"""
15
+ transform = transforms.Compose(
16
+ [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
17
+ )
18
+ test_dataset = datasets.MNIST(
19
+ root="./data", train=False, download=True, transform=transform
20
+ )
21
+ test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
22
+ return test_loader
23
+
24
+
25
+ def create_model(precision):
26
+ """Create MLP model with specified precision"""
27
+ input_size = 28 * 28 # MNIST images are 28x28
28
+ hidden_size = 128
29
+ num_classes = 10
30
+ device = get_pytorch_device()
31
+
32
+ model = MLP(input_size, hidden_size, num_classes).to(device)
33
+
34
+ # Convert model to target precision
35
+ if precision == "fp16":
36
+ model = model.to(torch.float16)
37
+ elif precision == "bf16":
38
+ model = model.to(torch.bfloat16)
39
+ elif precision == "fp32":
40
+ model = model.to(torch.float32)
41
+
42
+ return model, device
43
+
44
+
45
+ def load_model_weights(model, model_path):
46
+ """Load model weights from checkpoint"""
47
+ model.load_state_dict(torch.load(model_path))
48
+ return model
49
+
50
+
51
+ def evaluate_model(model, test_loader, device, precision):
52
+ """Evaluate model accuracy and inference time"""
53
+ model.eval()
54
+ correct = 0
55
+ total = 0
56
+
57
+ # For measuring inference time
58
+ start_time = time.time()
59
+
60
+ with torch.no_grad():
61
+ for data, target in test_loader:
62
+ # Convert input to specified precision
63
+ if precision == "fp16":
64
+ data = data.half()
65
+ elif precision == "bf16":
66
+ data = data.to(torch.bfloat16)
67
+
68
+ data, target = data.to(device), target.to(device)
69
+
70
+ # Forward pass
71
+ outputs = model(data)
72
+
73
+ # Calculate accuracy
74
+ _, predicted = torch.max(outputs, 1)
75
+ total += target.size(0)
76
+ correct += (predicted == target).sum().item()
77
+
78
+ inference_time = time.time() - start_time
79
+ accuracy = 100.0 * correct / total
80
+
81
+ return {
82
+ "accuracy": accuracy,
83
+ "inference_time": inference_time,
84
+ "correct": correct,
85
+ "total": total,
86
+ }
87
+
88
+
89
+ def compare_precision_inference(fp32_model_path):
90
+ """Compare FP32 trained model inference in different precisions"""
91
+ print("\nEvaluating FP32-trained model inference with different precisions")
92
+ print("-" * 80)
93
+
94
+ # Load test data
95
+ test_loader = load_mnist_data()
96
+
97
+ # Verify the model file exists
98
+ model_path = Path(fp32_model_path)
99
+ if not model_path.exists():
100
+ print(f"Error: Model file {fp32_model_path} not found!")
101
+ return
102
+
103
+ # Create models in different precisions
104
+ precisions = ["fp32", "bf16"]
105
+ results = {}
106
+
107
+ for precision in precisions:
108
+ print(f"\nTesting inference in {precision.upper()} mode...")
109
+
110
+ # Create model in specified precision
111
+ model, device = create_model(precision)
112
+
113
+ # Load weights from the FP32-trained model
114
+ # When loading to a BF16 model, the weights will be automatically cast
115
+ model = load_model_weights(model, fp32_model_path)
116
+
117
+ # Evaluate model
118
+ results[precision] = evaluate_model(model, test_loader, device, precision)
119
+
120
+ # Print results
121
+ print(f"Accuracy: {results[precision]['accuracy']:.2f}%")
122
+ print(f"Inference time: {results[precision]['inference_time']:.4f} seconds")
123
+
124
+ # Calculate and print comparison metrics
125
+ print("\nComparison Summary")
126
+ print("-" * 80)
127
+
128
+ fp32_results = results["fp32"]
129
+ bf16_results = results["bf16"]
130
+
131
+ acc_diff = bf16_results["accuracy"] - fp32_results["accuracy"]
132
+ time_ratio = fp32_results["inference_time"] / bf16_results["inference_time"]
133
+
134
+ print(f"Accuracy drop from FP32 to BF16: {acc_diff:.2f}%")
135
+ print(f"Inference speedup with BF16: {time_ratio:.2f}x")
136
+
137
+ return results
138
+
139
+
140
+ def detailed_precision_comparison(
141
+ fp32_model_path, trials=3, batch_sizes=[1, 16, 32, 64, 128, 256]
142
+ ):
143
+ """Run detailed comparison with multiple batch sizes and trials"""
144
+ print("\nDetailed Precision Comparison")
145
+ print("=" * 80)
146
+
147
+ # Verify the model file exists
148
+ model_path = Path(fp32_model_path)
149
+ if not model_path.exists():
150
+ print(f"Error: Model file {fp32_model_path} not found!")
151
+ return
152
+
153
+ precisions = ["fp32", "bf16"]
154
+ all_results = {}
155
+
156
+ for precision in precisions:
157
+ all_results[precision] = {}
158
+
159
+ print(f"\nEvaluating {precision.upper()} precision")
160
+ print("-" * 60)
161
+
162
+ # Create model in specified precision only once
163
+ model, device = create_model(precision)
164
+ model = load_model_weights(model, fp32_model_path)
165
+
166
+ # Warm up the GPU/CPU
167
+ print("Warming up...")
168
+ dummy_loader = load_mnist_data(batch_size=64)
169
+ with torch.no_grad():
170
+ for data, _ in dummy_loader:
171
+ if precision == "bf16":
172
+ data = data.to(torch.bfloat16)
173
+ elif precision == "fp16":
174
+ data = data.half()
175
+ data = data.to(device)
176
+ _ = model(data)
177
+ break
178
+
179
+ # Run trials for different batch sizes
180
+ for batch_size in batch_sizes:
181
+ print(f"\n Batch size: {batch_size}")
182
+ test_loader = load_mnist_data(batch_size=batch_size)
183
+
184
+ batch_results = {"accuracy": [], "inference_time": []}
185
+
186
+ for trial in range(trials):
187
+ print(f" Trial {trial+1}/{trials}...", end="", flush=True)
188
+ result = evaluate_model(model, test_loader, device, precision)
189
+ batch_results["accuracy"].append(result["accuracy"])
190
+ batch_results["inference_time"].append(result["inference_time"])
191
+ print(
192
+ f" done. Time: {result['inference_time']:.4f}s, Accuracy: {result['accuracy']:.2f}%"
193
+ )
194
+
195
+ # Calculate averages
196
+ avg_accuracy = sum(batch_results["accuracy"]) / len(
197
+ batch_results["accuracy"]
198
+ )
199
+ avg_time = sum(batch_results["inference_time"]) / len(
200
+ batch_results["inference_time"]
201
+ )
202
+
203
+ all_results[precision][batch_size] = {
204
+ "avg_accuracy": avg_accuracy,
205
+ "avg_inference_time": avg_time,
206
+ "trials": batch_results,
207
+ }
208
+
209
+ print(f" Average: {avg_time:.4f}s, Accuracy: {avg_accuracy:.2f}%")
210
+
211
+ # Print comparison table
212
+ print("\nComparison Results")
213
+ print("=" * 80)
214
+ print(
215
+ f"{'Batch Size':^10} | {'FP32 Acc':^10} | {'BF16 Acc':^10} | {'Acc Diff':^10} | {'FP32 Time':^10} | {'BF16 Time':^10} | {'Speedup':^10}"
216
+ )
217
+ print("-" * 80)
218
+
219
+ for batch_size in batch_sizes:
220
+ fp32_acc = all_results["fp32"][batch_size]["avg_accuracy"]
221
+ bf16_acc = all_results["bf16"][batch_size]["avg_accuracy"]
222
+ acc_diff = bf16_acc - fp32_acc
223
+
224
+ fp32_time = all_results["fp32"][batch_size]["avg_inference_time"]
225
+ bf16_time = all_results["bf16"][batch_size]["avg_inference_time"]
226
+ speedup = fp32_time / bf16_time
227
+
228
+ print(
229
+ f"{batch_size:^10} | {fp32_acc:^10.2f} | {bf16_acc:^10.2f} | {acc_diff:^10.2f} | {fp32_time:^10.4f} | {bf16_time:^10.4f} | {speedup:^10.2f}x"
230
+ )
231
+
232
+ # Calculate and print overall averages
233
+ avg_acc_diff = sum(
234
+ all_results["bf16"][bs]["avg_accuracy"]
235
+ - all_results["fp32"][bs]["avg_accuracy"]
236
+ for bs in batch_sizes
237
+ ) / len(batch_sizes)
238
+ avg_speedup = sum(
239
+ all_results["fp32"][bs]["avg_inference_time"]
240
+ / all_results["bf16"][bs]["avg_inference_time"]
241
+ for bs in batch_sizes
242
+ ) / len(batch_sizes)
243
+
244
+ print("-" * 80)
245
+ print(f"Average accuracy difference across all batch sizes: {avg_acc_diff:.2f}%")
246
+ print(f"Average speedup across all batch sizes: {avg_speedup:.2f}x")
247
+
248
+ return all_results
249
+
250
+
251
+ if __name__ == "__main__":
252
+ import argparse
253
+
254
+ parser = argparse.ArgumentParser(
255
+ description="Compare model inference with FP32 vs BF16"
256
+ )
257
+ parser.add_argument(
258
+ "--model_path",
259
+ type=str,
260
+ default="models/mlp_mnist_fp32.pth",
261
+ help="Path to FP32 trained model weights",
262
+ )
263
+ parser.add_argument(
264
+ "--detailed",
265
+ action="store_true",
266
+ help="Run detailed comparison with multiple batch sizes",
267
+ )
268
+ parser.add_argument(
269
+ "--trials",
270
+ type=int,
271
+ default=3,
272
+ help="Number of trials to run (only with --detailed)",
273
+ )
274
+
275
+ args = parser.parse_args()
276
+
277
+ if args.detailed:
278
+ detailed_precision_comparison(args.model_path, trials=args.trials)
279
+ else:
280
+ compare_precision_inference(args.model_path)
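A sketch of calling the detailed comparison directly, assuming an FP32 checkpoint already exists (for example one produced by nn/precision.py):

    from hardware_accelerators.nn.precision_eval import detailed_precision_comparison

    results = detailed_precision_comparison(
        "models/mlp_mnist_fp32.pth", trials=3, batch_sizes=[1, 64, 256]
    )
    print(results["bf16"][64]["avg_accuracy"])  # average BF16 accuracy at batch size 64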
hardware_accelerators/nn/run_precision_comparison.py ADDED
@@ -0,0 +1,78 @@
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+ import argparse
5
+ from hardware_accelerators.nn.precision import train_model
6
+ from hardware_accelerators.nn.precision_eval import (
7
+ compare_precision_inference,
8
+ detailed_precision_comparison,
9
+ )
10
+
11
+
12
+ def main():
13
+ # Parse command line arguments
14
+ parser = argparse.ArgumentParser(
15
+ description="Train and evaluate precision differences"
16
+ )
17
+ parser.add_argument(
18
+ "--detailed",
19
+ action="store_true",
20
+ help="Run detailed comparison with multiple batch sizes and trials",
21
+ )
22
+ parser.add_argument(
23
+ "--trials",
24
+ type=int,
25
+ default=3,
26
+ help="Number of trials to run for each batch size (only with --detailed)",
27
+ )
28
+ parser.add_argument(
29
+ "--force-train",
30
+ action="store_true",
31
+ help="Force training a new model even if one exists",
32
+ )
33
+ parser.add_argument(
34
+ "--batch-sizes",
35
+ nargs="+",
36
+ type=int,
37
+ default=[1, 16, 32, 64, 128, 256],
38
+ help="Batch sizes to test (only with --detailed)",
39
+ )
40
+ args = parser.parse_args()
41
+
42
+ # Create models directory if it doesn't exist
43
+ os.makedirs("models", exist_ok=True)
44
+
45
+ # Path to save/load model
46
+ model_path = "models/mlp_mnist_fp32.pth"
47
+
48
+ # Check if model exists, train if not or if forced
49
+ if not os.path.exists(model_path) or args.force_train:
50
+ print("Training a new FP32 model...")
51
+ # Train a model in FP32 precision
52
+ train_model(
53
+ precision="fp32",
54
+ batch_size=64,
55
+ hidden_size=128,
56
+ num_epochs=5,
57
+ learning_rate=0.001,
58
+ optimizer_name="adam",
59
+ model_save_path=model_path,
60
+ )
61
+ else:
62
+ print(f"Using existing model at {model_path}")
63
+
64
+ # Run comparison
65
+ if args.detailed:
66
+ print(
67
+ f"Running detailed comparison with {args.trials} trials for each batch size..."
68
+ )
69
+ detailed_precision_comparison(
70
+ model_path, trials=args.trials, batch_sizes=args.batch_sizes
71
+ )
72
+ else:
73
+ # Run basic comparison
74
+ compare_precision_inference(model_path)
75
+
76
+
77
+ if __name__ == "__main__":
78
+ main()
hardware_accelerators/nn/train.py CHANGED
@@ -8,8 +8,6 @@ from tqdm.auto import tqdm
8
 
9
  from .util import model_factory, get_pytorch_device # progress bar for notebooks
10
 
11
- # from pytorch2tikz import Architecture
12
-
13
 
14
  # Training function for one epoch
15
  def train(model, device, train_loader, optimizer, criterion, epoch, num_epochs):
 
8
 
9
  from .util import model_factory, get_pytorch_device # progress bar for notebooks
10
 
 
 
11
 
12
  # Training function for one epoch
13
  def train(model, device, train_loader, optimizer, criterion, epoch, num_epochs):
hardware_accelerators/nn/util.py CHANGED
@@ -27,8 +27,10 @@ def load_model(model_path: str, device: torch.device | None = None):
27
  if device is None:
28
  device = get_pytorch_device()
29
  model = model_factory()
 
 
 
30
  model.to(device)
31
- model.load_state_dict(torch.load(model_path, map_location=device))
32
  return model
33
 
34
 
 
27
  if device is None:
28
  device = get_pytorch_device()
29
  model = model_factory()
30
+ model.load_state_dict(
31
+ torch.load(model_path, map_location=device, weights_only=True)
32
+ )
33
  model.to(device)
 
34
  return model
35
 
36
 
hardware_accelerators/rtllib/__init__.py CHANGED
@@ -9,9 +9,13 @@ from .accelerator import (
9
  TiledMatrixEngine,
10
  AcceleratorConfig,
11
  Accelerator,
 
 
 
 
12
  )
13
 
14
- all = [
15
  "float_adder",
16
  "FloatAdderPipelined",
17
  "float_multiplier",
@@ -20,11 +24,15 @@ all = [
20
  "lmul_fast",
21
  "LmulPipelined",
22
  "SystolicArrayDiP",
23
- "AccumulatorMemoryBank",
24
  "BufferMemory",
25
  "WeightFIFO",
26
  "TiledAcceleratorConfig",
27
  "TiledMatrixEngine",
28
  "AcceleratorConfig",
29
  "Accelerator",
 
 
 
 
30
  ]
 
9
  TiledMatrixEngine,
10
  AcceleratorConfig,
11
  Accelerator,
12
+ CompiledAcceleratorConfig,
13
+ CompiledAccelerator,
14
+ AcceleratorAnalysisConfig,
15
+ AcceleratorTopLevel,
16
  )
17
 
18
+ __all__ = [
19
  "float_adder",
20
  "FloatAdderPipelined",
21
  "float_multiplier",
 
24
  "lmul_fast",
25
  "LmulPipelined",
26
  "SystolicArrayDiP",
27
+ "TiledAccumulatorMemoryBank",
28
  "BufferMemory",
29
  "WeightFIFO",
30
  "TiledAcceleratorConfig",
31
  "TiledMatrixEngine",
32
  "AcceleratorConfig",
33
  "Accelerator",
34
+ "CompiledAcceleratorConfig",
35
+ "CompiledAccelerator",
36
+ "AcceleratorAnalysisConfig",
37
+ "AcceleratorTopLevel",
38
  ]
hardware_accelerators/rtllib/accelerator.py CHANGED
@@ -1,99 +1,140 @@
 
1
  from dataclasses import dataclass
2
- from typing import Callable, Type, Dict
3
  import numpy as np
 
 
4
 
5
  from pyrtl import (
6
  WireVector,
7
  Register,
 
8
  Output,
9
  Simulation,
 
10
  concat,
11
  )
12
 
 
13
 
 
 
 
 
 
14
  from .buffer import BufferMemory, WeightFIFO
15
  from .systolic import SystolicArrayDiP
16
  from .accumulators import Accumulator, TiledAccumulatorMemoryBank
17
- from .activations import ReluUnit
18
  from ..dtypes import BaseFloat
19
 
 
20
 
21
- @dataclass
22
- class AcceleratorConfig:
23
- """Configuration class for a systolic array accelerator.
24
 
25
- This class defines the parameters and specifications for a systolic array
26
- accelerator including array dimensions, data types, arithmetic operations,
27
- and memory configuration.
28
- """
29
 
30
  array_size: int
31
- """Dimension of systolic array (always square)"""
32
-
33
- num_weight_tiles: int
34
- """Number of weight tiles in the FIFO. Each tile is equal to the size of the systolic array"""
35
-
36
- data_type: Type[BaseFloat]
37
- """Floating point format of input data to systolic array"""
38
-
39
  weight_type: Type[BaseFloat]
40
- """Floating point format of weight inputs"""
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- accum_type: Type[BaseFloat]
43
- """Floating point format to accumulate values in"""
 
 
 
 
 
 
 
 
 
 
 
44
 
45
- pe_adder: Callable[[WireVector, WireVector, Type[BaseFloat]], WireVector]
46
- """Function to generate adder hardware for the processing elements"""
 
 
 
 
 
 
 
 
 
 
47
 
48
- accum_adder: Callable[[WireVector, WireVector, Type[BaseFloat]], WireVector]
49
- """Function to generate adder hardware for the accumulator buffer"""
50
 
51
- pe_multiplier: Callable[[WireVector, WireVector, Type[BaseFloat]], WireVector]
52
- """Function to generate multiplier hardware for the processing elements"""
 
 
 
 
 
 
 
 
 
 
53
 
54
- pipeline: bool
55
- """Whether to add a pipeline stage in processing elements between multiplier and adder"""
 
 
56
 
57
- accum_addr_width: int
58
- """Address width for accumulator memory. Determines number of individually addressable locations"""
59
 
60
  @property
61
- def weight_tile_addr_width(self):
62
- """Get the width of the weight tile address bus in bits"""
63
- return (self.num_weight_tiles - 1).bit_length()
64
 
65
 
66
- class Accelerator:
67
- def __init__(self, config: AcceleratorConfig):
68
  self.config = config
69
 
70
  # Instantiate hardware components
71
- self.fifo = WeightFIFO(
72
- array_size=config.array_size,
73
- num_tiles=config.num_weight_tiles,
74
- dtype=config.weight_type,
75
- )
76
  self.systolic_array = SystolicArrayDiP(
77
  size=config.array_size,
78
- data_type=config.data_type,
79
  weight_type=config.weight_type,
80
- accum_type=config.accum_type,
81
- multiplier=config.pe_multiplier,
82
- adder=config.pe_adder,
83
- pipeline=config.pipeline,
84
  )
85
  self.accumulator = Accumulator(
86
- addr_width=config.accum_addr_width,
87
  array_size=config.array_size,
88
- data_type=config.accum_type,
89
- adder=config.accum_adder,
90
  )
91
  self.activation = ReluUnit(
92
  size=config.array_size,
93
- dtype=config.accum_type,
94
  )
95
  self.outputs = [
96
- WireVector(config.accum_type.bitwidth()) for _ in range(config.array_size)
 
97
  ]
98
 
99
  # Connect components
@@ -103,12 +144,15 @@ class Accelerator:
103
  """Create unnamed WireVectors for control signals"""
104
  self.data_enable = WireVector(1)
105
  self.data_ins = [
106
- WireVector(self.config.data_type.bitwidth())
107
  for _ in range(self.config.array_size)
108
  ]
109
 
110
- self.weight_start_in = WireVector(1)
111
- self.weight_tile_addr_in = WireVector(self.fifo.tile_addr_width)
 
 
 
112
 
113
  self.accum_addr_in = WireVector(self.config.accum_addr_width)
114
  self.accum_mode_in = WireVector(1)
@@ -117,7 +161,7 @@ class Accelerator:
117
  self.act_func_in = WireVector(1) # Apply activation function or passthrough
118
 
119
  def _create_pipeline_registers(self):
120
- num_registers = self.config.array_size + 1 + int(self.config.pipeline)
121
 
122
  self.accum_addr_regs = [
123
  Register(self.config.accum_addr_width) for _ in range(num_registers)
@@ -153,16 +197,11 @@ class Accelerator:
153
  self._create_pipeline_registers()
154
 
155
  # Connect buffer to external inputs
156
- self.fifo.connect_inputs(
157
- start=self.weight_start_in,
158
- tile_addr=self.weight_tile_addr_in,
159
- )
160
-
161
  self.systolic_array.connect_inputs(
162
  data_inputs=self.data_ins,
163
  enable_input=self.data_enable,
164
- weight_inputs=self.fifo.outputs.weights,
165
- weight_enable=self.fifo.outputs.active,
166
  )
167
 
168
  # Connect accumulator to systolic array
@@ -188,8 +227,8 @@ class Accelerator:
188
  self,
189
  data_enable: WireVector | None = None,
190
  data_inputs: list[WireVector] | None = None,
191
- weight_start: WireVector | None = None,
192
- weight_tile_addr: WireVector | None = None,
193
  accum_addr: WireVector | None = None,
194
  accum_mode: WireVector | None = None,
195
  act_start: WireVector | None = None,
@@ -204,9 +243,8 @@ class Accelerator:
204
  Args:
205
  data_enable: 1-bit signal that enables data flow into the systolic array
206
  data_inputs: List of input data wires for the systolic array. Must match array_size
207
- weight_start: 1-bit signal that triggers loading of a new weight tile when pulsed high
208
- weight_tile_addr: Address selecting which weight tile to load from the FIFO.
209
- Width must match the FIFO's tile address width
210
  accum_addr: Address for the accumulator memory bank. Width must match accum_addr_width
211
  accum_mode: 1-bit mode select (0=overwrite, 1=accumulate with existing values)
212
  act_start: 1-bit signal to enable passing data through the activation unit
@@ -226,22 +264,27 @@ class Accelerator:
226
  f"Expected {self.config.array_size}, got {len(data_inputs)}"
227
  )
228
  for i, wire in enumerate(data_inputs):
229
- assert len(wire) == self.config.data_type.bitwidth(), (
230
  f"Data input width mismatch. "
231
- f"Expected {self.config.data_type.bitwidth()}, got {len(wire)}"
232
  )
233
  self.data_ins[i] <<= wire
234
 
235
- if weight_start is not None:
236
- assert len(weight_start) == 1, "Weight start signal must be 1 bit wide"
237
- self.weight_start_in <<= weight_start
238
 
239
- if weight_tile_addr is not None:
240
- assert len(weight_tile_addr) == self.fifo.tile_addr_width, (
241
- f"Weight tile address width mismatch. "
242
- f"Expected {self.fifo.tile_addr_width}, got {len(weight_tile_addr)}"
243
  )
244
- self.weight_tile_addr_in <<= weight_tile_addr
245
 
246
  if accum_addr is not None:
247
  assert len(accum_addr) == self.config.accum_addr_width, (
@@ -278,12 +321,8 @@ class Accelerator:
278
  assert len(valid) == 1, "Output valid signal must be a single bit wire"
279
  valid <<= self.activation.outputs_valid
280
 
281
- def inspect_systolic_array_state(self, sim: Simulation):
282
- """Return current PE array state"""
283
- return self.systolic_array.get_state(sim)
284
-
285
- def inspect_accumulator_state(self, sim: Simulation) -> np.ndarray:
286
- """Return all accumulator tiles as 3D array.
287
 
288
  Args:
289
  sim: PyRTL simulation instance
@@ -296,18 +335,237 @@ class Accelerator:
296
  tiles = []
297
  for addr in range(2**self.config.accum_addr_width):
298
  row = [
299
- float(self.config.accum_type(binint=sim.inspect_mem(bank).get(addr, 0)))
300
  for bank in self.accumulator.memory_banks
301
  ]
302
  tiles.append(row)
303
  return np.array(tiles)
304
 
305
 
306
- class CompiledAccelerator:
307
  def __init__(self, config: AcceleratorConfig):
308
  self.config = config
309
 
310
  # Instantiate hardware components
311
  self.systolic_array = SystolicArrayDiP(
312
  size=config.array_size,
313
  data_type=config.data_type,
@@ -342,11 +600,8 @@ class CompiledAccelerator:
342
  for _ in range(self.config.array_size)
343
  ]
344
 
345
- self.weight_enable = WireVector(1)
346
- self.weights_in = [
347
- WireVector(self.config.weight_type.bitwidth())
348
- for _ in range(self.config.array_size)
349
- ]
350
 
351
  self.accum_addr_in = WireVector(self.config.accum_addr_width)
352
  self.accum_mode_in = WireVector(1)
@@ -367,23 +622,33 @@ class CompiledAccelerator:
367
  self.accum_mode_out = WireVector(1)
368
  self.accum_mode_out <<= self.accum_mode_regs[-1]
369
 
370
- self.act_control_regs = [Register(2) for _ in range(num_registers)]
371
- self.act_control_regs[0].next <<= concat(self.act_start_in, self.act_func_in)
372
 
373
  self.accum_addr_regs[0].next <<= self.accum_addr_in
374
  self.accum_mode_regs[0].next <<= self.accum_mode_in
375
  for i in range(1, len(self.accum_addr_regs)):
376
  self.accum_addr_regs[i].next <<= self.accum_addr_regs[i - 1]
377
  self.accum_mode_regs[i].next <<= self.accum_mode_regs[i - 1]
378
- self.act_control_regs[i].next <<= self.act_control_regs[i - 1]
 
 
 
379
 
380
  self.act_addr = Register(self.config.accum_addr_width)
381
  self.act_func = Register(1)
382
  self.act_start = Register(1)
383
 
384
  self.act_addr.next <<= self.accum_addr_out
385
- self.act_func.next <<= self.act_control_regs[-1][0]
386
- self.act_start.next <<= self.act_control_regs[-1][1]
 
 
387
 
388
  def _connect_components(self):
389
  """Internal component connections"""
@@ -391,11 +656,16 @@ class CompiledAccelerator:
391
  self._create_pipeline_registers()
392
 
393
  # Connect buffer to external inputs
394
  self.systolic_array.connect_inputs(
395
  data_inputs=self.data_ins,
396
  enable_input=self.data_enable,
397
- weight_inputs=self.weights_in,
398
- weight_enable=self.weight_enable,
399
  )
400
 
401
  # Connect accumulator to systolic array
@@ -421,8 +691,8 @@ class CompiledAccelerator:
421
  self,
422
  data_enable: WireVector | None = None,
423
  data_inputs: list[WireVector] | None = None,
424
- weight_enable: WireVector | None = None,
425
- weights_in: list[WireVector] | None = None,
426
  accum_addr: WireVector | None = None,
427
  accum_mode: WireVector | None = None,
428
  act_start: WireVector | None = None,
@@ -437,8 +707,9 @@ class CompiledAccelerator:
437
  Args:
438
  data_enable: 1-bit signal that enables data flow into the systolic array
439
  data_inputs: List of input data wires for the systolic array. Must match array_size
440
- weight_enable: 1-bit signal enable writing new weights to systolic array registers
441
- weights_in: List of input weight wires for the systolic array. Must match array_size
 
442
  accum_addr: Address for the accumulator memory bank. Width must match accum_addr_width
443
  accum_mode: 1-bit mode select (0=overwrite, 1=accumulate with existing values)
444
  act_start: 1-bit signal to enable passing data through the activation unit
@@ -464,21 +735,16 @@ class CompiledAccelerator:
464
  )
465
  self.data_ins[i] <<= wire
466
 
467
- if weight_enable is not None:
468
- assert len(weight_enable) == 1, "Weight start signal must be 1 bit wide"
469
- self.weight_enable <<= weight_enable
470
 
471
- if weights_in is not None:
472
- assert len(weights_in) == self.config.array_size, (
473
- f"Weights input list length must match array size. "
474
- f"Expected {self.config.array_size}, got {len(weights_in)}"
475
  )
476
- for i, wire in enumerate(weights_in):
477
- assert len(wire) == self.config.weight_type.bitwidth(), (
478
- f"Weight input wire width mismatch. "
479
- f"Expected {self.config.weight_type.bitwidth()}, got {len(wire)}"
480
- )
481
- self.weights_in[i] <<= wire
482
 
483
  if accum_addr is not None:
484
  assert len(accum_addr) == self.config.accum_addr_width, (
@@ -515,6 +781,34 @@ class CompiledAccelerator:
515
  assert len(valid) == 1, "Output valid signal must be a single bit wire"
516
  valid <<= self.activation.outputs_valid
517
 
518
 
519
  @dataclass
520
  class TiledAcceleratorConfig:
 
1
+ from __future__ import annotations
2
  from dataclasses import dataclass
3
+ from typing import Callable, Literal, Type, Dict
4
  import numpy as np
5
+ import hashlib
6
+ import json
7
 
8
  from pyrtl import (
9
  WireVector,
10
  Register,
11
+ Input,
12
  Output,
13
  Simulation,
14
+ CompiledSimulation,
15
  concat,
16
  )
17
 
18
+ from .adders import float_adder
19
 
20
+ from ..dtypes.bfloat16 import BF16
21
+
22
+ from .multipliers import *
23
+ from .adders import *
24
+ from .lmul import *
25
  from .buffer import BufferMemory, WeightFIFO
26
  from .systolic import SystolicArrayDiP
27
  from .accumulators import Accumulator, TiledAccumulatorMemoryBank
28
+ from .activations import ReluState, ReluUnit
29
  from ..dtypes import BaseFloat
30
 
31
+ from dataclasses import dataclass
32
 
 
 
 
33
 
34
+ @dataclass # (frozen=True)
35
+ class CompiledAcceleratorConfig:
36
+ """Configuration for a compiled accelerator."""
 
37
 
38
  array_size: int
39
+ activation_type: Type[BaseFloat]
 
 
 
 
 
 
 
40
  weight_type: Type[BaseFloat]
41
+ multiplier: Callable[[WireVector, WireVector, Type[BaseFloat]], WireVector]
42
+ accum_addr_width: int = 12 # 4096 accumulator slots
43
+ pipeline_pe: bool = False
44
+
45
+ def __post_init__(self):
46
+ """Validate configuration after initialization."""
47
+ # Ensure activation dtype has bitwidth >= weight dtype
48
+ if self.activation_type.bitwidth() < self.weight_type.bitwidth():
49
+ raise ValueError(
50
+ f"Activation dtype bitwidth ({self.activation_type.bitwidth()}) must be greater than or equal to "
51
+ f"weight dtype bitwidth ({self.weight_type.bitwidth()})"
52
+ )
53
 
54
+ @property
55
+ def name(self):
56
+ dtype_name = lambda d: d.bitwidth() if d != BF16 else "b16"
57
+ lmul = "-lmul" if "lmul" in self.multiplier.__name__.lower() else ""
58
+ mem = f"-m{self.accum_addr_width}" if self.accum_addr_width != 12 else ""
59
+ return (
60
+ f"w{dtype_name(self.weight_type)}"
61
+ f"a{dtype_name(self.activation_type)}"
62
+ f"-{self.array_size}x{self.array_size}"
63
+ f"{lmul}"
64
+ f"{'-p' if self.pipeline_pe else ''}"
65
+ f"{mem}"
66
+ )
67
 
68
+ def __repr__(self) -> str:
69
+ return (
70
+ "CompiledAcceleratorConfig(\n"
71
+ f"\tarray_size={self.array_size}\n"
72
+ f"\tactivation_type={self.activation_type.__name__}\n"
73
+ f"\tweight_type={self.weight_type.__name__}\n"
74
+ f"\tmultiplier={self.multiplier.__name__}\n"
75
+ f"\taccum_addr_width={self.accum_addr_width}\n"
76
+ f"\tpipeline={self.pipeline_pe}\n"
77
+ # f'\tname="{self.name}"\n'
78
+ ")"
79
+ )
80
 
81
+ def __hash__(self) -> int:
82
+ """Generate a consistent hash value for this configuration.
83
 
84
+ Returns:
85
+ An integer hash value.
86
+ """
87
+ # Create a dictionary of the key configuration parameters
88
+ config_dict = {
89
+ "array_size": self.array_size,
90
+ "activation_type": f"{self.activation_type.__module__}.{self.activation_type.__name__}",
91
+ "weight_type": f"{self.weight_type.__module__}.{self.weight_type.__name__}",
92
+ "multiplier": self.multiplier.__name__,
93
+ "accum_addr_width": self.accum_addr_width,
94
+ "pipeline": self.pipeline_pe,
95
+ }
96
 
97
+ # Generate a hash from the sorted JSON representation
98
+ hash_str = hashlib.sha256(
99
+ json.dumps(config_dict, sort_keys=True).encode()
100
+ ).hexdigest()
101
 
102
+ # Convert the first 16 characters of the hex string to an integer
103
+ return int(hash_str[:16], 16)
104
 
105
  @property
106
+ def id(self):
107
+ """Get a unique hexadecimal identifier for this configuration."""
108
+ return hex(self.__hash__())[2:]
109
 
110
 
111
+ class CompiledAccelerator:
112
+ def __init__(self, config: CompiledAcceleratorConfig):
113
  self.config = config
114
 
115
  # Instantiate hardware components
116
  self.systolic_array = SystolicArrayDiP(
117
  size=config.array_size,
118
+ data_type=config.activation_type,
119
  weight_type=config.weight_type,
120
+ accum_type=config.activation_type,
121
+ multiplier=config.multiplier,
122
+ adder=float_adder,
123
+ pipeline=config.pipeline_pe,
124
  )
125
  self.accumulator = Accumulator(
126
+ addr_width=config.accum_addr_width,
127
  array_size=config.array_size,
128
+ data_type=config.activation_type,
129
+ adder=float_adder,
130
  )
131
  self.activation = ReluUnit(
132
  size=config.array_size,
133
+ dtype=config.activation_type,
134
  )
135
  self.outputs = [
136
+ WireVector(config.activation_type.bitwidth())
137
+ for _ in range(config.array_size)
138
  ]
139
 
140
  # Connect components
 
144
  """Create unnamed WireVectors for control signals"""
145
  self.data_enable = WireVector(1)
146
  self.data_ins = [
147
+ WireVector(self.config.activation_type.bitwidth())
148
  for _ in range(self.config.array_size)
149
  ]
150
 
151
+ self.weight_enable = WireVector(1)
152
+ self.weights_in = [
153
+ WireVector(self.config.weight_type.bitwidth())
154
+ for _ in range(self.config.array_size)
155
+ ]
156
 
157
  self.accum_addr_in = WireVector(self.config.accum_addr_width)
158
  self.accum_mode_in = WireVector(1)
 
161
  self.act_func_in = WireVector(1) # Apply activation function or passthrough
162
 
163
  def _create_pipeline_registers(self):
164
+ num_registers = self.config.array_size + 1 + int(self.config.pipeline_pe)
165
 
166
  self.accum_addr_regs = [
167
  Register(self.config.accum_addr_width) for _ in range(num_registers)
 
197
  self._create_pipeline_registers()
198
 
199
  # Connect buffer to external inputs
200
  self.systolic_array.connect_inputs(
201
  data_inputs=self.data_ins,
202
  enable_input=self.data_enable,
203
+ weight_inputs=self.weights_in,
204
+ weight_enable=self.weight_enable,
205
  )
206
 
207
  # Connect accumulator to systolic array
 
227
  self,
228
  data_enable: WireVector | None = None,
229
  data_inputs: list[WireVector] | None = None,
230
+ weight_enable: WireVector | None = None,
231
+ weights_in: list[WireVector] | None = None,
232
  accum_addr: WireVector | None = None,
233
  accum_mode: WireVector | None = None,
234
  act_start: WireVector | None = None,
 
243
  Args:
244
  data_enable: 1-bit signal that enables data flow into the systolic array
245
  data_inputs: List of input data wires for the systolic array. Must match array_size
246
+ weight_enable: 1-bit signal that enables writing new weights to the systolic array registers
247
+ weights_in: List of input weight wires for the systolic array. Must match array_size
 
248
  accum_addr: Address for the accumulator memory bank. Width must match accum_addr_width
249
  accum_mode: 1-bit mode select (0=overwrite, 1=accumulate with existing values)
250
  act_start: 1-bit signal to enable passing data through the activation unit
 
264
  f"Expected {self.config.array_size}, got {len(data_inputs)}"
265
  )
266
  for i, wire in enumerate(data_inputs):
267
+ assert len(wire) == self.config.activation_type.bitwidth(), (
268
  f"Data input width mismatch. "
269
+ f"Expected {self.config.activation_type.bitwidth()}, got {len(wire)}"
270
  )
271
  self.data_ins[i] <<= wire
272
 
273
+ if weight_enable is not None:
274
+ assert len(weight_enable) == 1, "Weight enable signal must be 1 bit wide"
275
+ self.weight_enable <<= weight_enable
276
 
277
+ if weights_in is not None:
278
+ assert len(weights_in) == self.config.array_size, (
279
+ f"Weights input list length must match array size. "
280
+ f"Expected {self.config.array_size}, got {len(weights_in)}"
281
  )
282
+ for i, wire in enumerate(weights_in):
283
+ assert len(wire) == self.config.weight_type.bitwidth(), (
284
+ f"Weight input wire width mismatch. "
285
+ f"Expected {self.config.weight_type.bitwidth()}, got {len(wire)}"
286
+ )
287
+ self.weights_in[i] <<= wire
288
 
289
  if accum_addr is not None:
290
  assert len(accum_addr) == self.config.accum_addr_width, (
 
321
  assert len(valid) == 1, "Output valid signal must be a single bit wire"
322
  valid <<= self.activation.outputs_valid
323
 
324
+ def inspect_accumulator_state(self, sim: CompiledSimulation) -> np.ndarray:
325
+ """Return accumulator memory as an array.
326
 
327
  Args:
328
  sim: PyRTL simulation instance
 
335
  tiles = []
336
  for addr in range(2**self.config.accum_addr_width):
337
  row = [
338
+ float(
339
+ self.config.activation_type(
340
+ binint=sim.inspect_mem(bank).get(addr, 0)
341
+ )
342
+ )
343
  for bank in self.accumulator.memory_banks
344
  ]
345
  tiles.append(row)
346
  return np.array(tiles)
347
 
348
 
349
+ @dataclass
350
+ class AcceleratorAnalysisConfig:
351
+ """Configuration for an accelerator to be generated for analysis."""
352
+
353
+ array_size: int
354
+ """
355
+ The size of the systolic array (N x N).
356
+ Determines the number of processing elements in the accelerator.
357
+ """
358
+
359
+ weight_type: Type[BaseFloat]
360
+ """
361
+ The floating-point data type for weights.
362
+ Must be a subclass of BaseFloat (e.g., Float8, BF16, Float32).
363
+ """
364
+
365
+ activation_type: Type[BaseFloat]
366
+ """
367
+ The floating-point data type for activations/inputs.
368
+ Must be a subclass of BaseFloat (e.g., Float8, BF16, Float32).
369
+ """
370
+
371
+ lmul: bool
372
+ """
373
+ Whether to use L-mul for multiplication operations.
374
+ If True, uses linear-time multipliers; if False, uses standard IEEE multipliers.
375
+ """
376
+
377
+ pipeline_level: Literal["low", "high"] | None
378
+ """
379
+ The level of pipelining in the accelerator:
380
+ - None: No pipelining (fully combinational design)
381
+ - 'low': Basic pipelining between multiplier and adder in each PE
382
+ - 'high': Full pipelining with pipelined arithmetic units
383
+ """
384
+
385
+ use_fast_internals: bool
386
+ """
387
+ Whether to use faster basic arithmetic implementations with more complex low-level RTL.
388
+ - True: uses optimized arithmetic units from PyRTL's rtllib
389
+ - False: prioritize simplicity over speed
390
+
391
+ WARNING: Setting to True could potentially make final synthesis on the Verilog output worse as the synthesis tools will not be able to infer optimal circuits from the complex low-level RTL.
392
+ """
393
+
394
+ accum_addr_width: int = 12
395
+ """
396
+ The bit width of the accumulator address.
397
+ Determines the size of the accumulator memory (2^width entries).
398
+ Default is 12 bits (4096 entries).
399
+ """
400
+
401
+ def __post_init__(self):
402
+ # Ensure activation dtype has bitwidth >= weight dtype
403
+ if self.activation_type.bitwidth() < self.weight_type.bitwidth():
404
+ raise ValueError(
405
+ f"Activation dtype bitwidth ({self.activation_type.bitwidth()}) must be greater than or equal to "
406
+ f"weight dtype bitwidth ({self.weight_type.bitwidth()})"
407
+ )
408
+
409
+ # Determine if we should use pipelined arithmetic functions
410
+ use_pipelined_funcs = self.pipeline_level == "high"
411
+
412
+ # Set pipeline_pe flag for PE configuration
413
+ # True if any pipeline level is specified (low or high)
414
+ self.pipeline_pe = self.pipeline_level is not None
415
+
416
+ # Multiplier function selection using dictionary mapping
417
+ multiplier_map = {
418
+ # (lmul, use_pipelined_funcs, fast_internals) -> function
419
+ (True, True, True): lmul_pipelined_fast,
420
+ (True, True, False): lmul_pipelined,
421
+ (True, False, True): lmul_fast,
422
+ (True, False, False): lmul_simple,
423
+ (False, True, True): float_multiplier_pipelined_fast_unstable,
424
+ (False, True, False): float_multiplier_pipelined,
425
+ (False, False, True): float_multiplier_fast_unstable,
426
+ (False, False, False): float_multiplier,
427
+ }
428
+
429
+ # Adder function selection using dictionary mapping
430
+ adder_map = {
431
+ # (use_pipelined_funcs, fast_internals) -> function
432
+ (True, True): float_adder_pipelined_fast_unstable,
433
+ (True, False): float_adder_pipelined,
434
+ (False, True): float_adder_fast_unstable,
435
+ (False, False): float_adder,
436
+ }
437
+
438
+ # Select functions using the maps
439
+ self.multiplier_func = multiplier_map[
440
+ (self.lmul, use_pipelined_funcs, self.use_fast_internals)
441
+ ]
442
+ self.adder_func = adder_map[(use_pipelined_funcs, self.use_fast_internals)]
443
+
444
+ @property
445
+ def name(self):
446
+ dtype_name = lambda d: d.bitwidth() if d != BF16 else "b16"
447
+ mul = "-lmul" if self.lmul else "-ieee"
448
+ pipe_name_map = {"low": "-pipePE", "high": "-pipeALL"}
449
+ fast = "-fast" if self.use_fast_internals else ""
450
+ mem = f"-m{self.accum_addr_width}" if self.accum_addr_width != 12 else ""
451
+ return (
452
+ f"w{dtype_name(self.weight_type)}"
453
+ f"a{dtype_name(self.activation_type)}"
454
+ f"-{self.array_size}x{self.array_size}"
455
+ + mem
456
+ + mul
457
+ + fast
458
+ + pipe_name_map.get(self.pipeline_level, "") # type: ignore
459
+ )
460
+
461
+
462
+ class AcceleratorTopLevel(CompiledAccelerator):
463
+ def __init__(self, config: AcceleratorAnalysisConfig):
464
+ self.config = config
465
+
466
+ # Instantiate hardware components
467
+ self.systolic_array = SystolicArrayDiP(
468
+ size=config.array_size,
469
+ data_type=config.activation_type,
470
+ weight_type=config.weight_type,
471
+ accum_type=config.activation_type,
472
+ multiplier=config.multiplier_func,
473
+ adder=config.adder_func,
474
+ pipeline=config.pipeline_pe,
475
+ )
476
+ self.accumulator = Accumulator(
477
+ addr_width=config.accum_addr_width,
478
+ array_size=config.array_size,
479
+ data_type=config.activation_type,
480
+ adder=config.adder_func,
481
+ )
482
+ self.activation = ReluUnit(
483
+ size=config.array_size,
484
+ dtype=config.activation_type,
485
+ )
486
+ self.outputs = [
487
+ Output(config.activation_type.bitwidth(), f"out_{i}")
488
+ for i in range(config.array_size)
489
+ ]
490
+
491
+ # Connect everything together and create io ports
492
+ self._connect_components()
493
+ self.valid_out = Output(1, "valid_out")
494
+ self.valid_out <<= self.activation.outputs_valid
495
+
496
+ def _create_control_wires(self):
497
+ """Create named Input wires for control signals"""
498
+ self.data_enable = Input(1, "data_enable")
499
+ self.data_ins = [
500
+ Input(self.config.activation_type.bitwidth(), f"data_in_{i}")
501
+ for i in range(self.config.array_size)
502
+ ]
503
+ self.weight_enable = Input(1, "weight_enable")
504
+ self.weights_in = [
505
+ Input(self.config.weight_type.bitwidth(), f"weight_in_{i}")
506
+ for i in range(self.config.array_size)
507
+ ]
508
+ self.accum_addr_in = Input(self.config.accum_addr_width, "accum_addr_in")
509
+ self.accum_mode_in = Input(1, "accum_mode_in")
510
+ self.act_start_in = Input(1, "act_start_in")
511
+ self.act_func_in = Input(1, "act_func_in")
512
+
513
+
514
+ @dataclass(unsafe_hash=True)
515
+ class AcceleratorConfig:
516
+ """Configuration class for a systolic array accelerator.
517
+
518
+ This class defines the parameters and specifications for a systolic array
519
+ accelerator including array dimensions, data types, arithmetic operations,
520
+ and memory configuration.
521
+ """
522
+
523
+ array_size: int
524
+ """Dimension of systolic array (always square)"""
525
+
526
+ num_weight_tiles: int
527
+ """Number of weight tiles in the FIFO. Each tile is equal to the size of the systolic array"""
528
+
529
+ data_type: Type[BaseFloat]
530
+ """Floating point format of input data to systolic array"""
531
+
532
+ weight_type: Type[BaseFloat]
533
+ """Floating point format of weight inputs"""
534
+
535
+ accum_type: Type[BaseFloat]
536
+ """Floating point format to accumulate values in"""
537
+
538
+ pe_adder: Callable[[WireVector, WireVector, Type[BaseFloat]], WireVector]
539
+ """Function to generate adder hardware for the processing elements"""
540
+
541
+ accum_adder: Callable[[WireVector, WireVector, Type[BaseFloat]], WireVector]
542
+ """Function to generate adder hardware for the accumulator buffer"""
543
+
544
+ pe_multiplier: Callable[[WireVector, WireVector, Type[BaseFloat]], WireVector]
545
+ """Function to generate multiplier hardware for the processing elements"""
546
+
547
+ pipeline: bool
548
+ """Whether to add a pipeline stage in processing elements between multiplier and adder"""
549
+
550
+ accum_addr_width: int
551
+ """Address width for accumulator memory. Determines number of individually addressable locations"""
552
+
553
+ @property
554
+ def weight_tile_addr_width(self):
555
+ """Get the width of the weight tile address bus in bits"""
556
+ return (self.num_weight_tiles - 1).bit_length()
557
+
558
+
559
+ class Accelerator:
560
  def __init__(self, config: AcceleratorConfig):
561
  self.config = config
562
 
563
  # Instantiate hardware components
564
+ self.fifo = WeightFIFO(
565
+ array_size=config.array_size,
566
+ num_tiles=config.num_weight_tiles,
567
+ dtype=config.weight_type,
568
+ )
569
  self.systolic_array = SystolicArrayDiP(
570
  size=config.array_size,
571
  data_type=config.data_type,
 
600
  for _ in range(self.config.array_size)
601
  ]
602
 
603
+ self.weight_start_in = WireVector(1)
604
+ self.weight_tile_addr_in = WireVector(self.fifo.tile_addr_width)
 
 
 
605
 
606
  self.accum_addr_in = WireVector(self.config.accum_addr_width)
607
  self.accum_mode_in = WireVector(1)
 
622
  self.accum_mode_out = WireVector(1)
623
  self.accum_mode_out <<= self.accum_mode_regs[-1]
624
 
625
+ self.act_start_regs = [Register(1) for _ in range(num_registers)]
626
+ self.act_enable_regs = [Register(1) for _ in range(num_registers)]
627
+ self.act_start_regs[0].next <<= self.act_start_in
628
+ self.act_enable_regs[0].next <<= self.act_func_in
629
+
630
+ # self.act_control_regs = [Register(2) for _ in range(num_registers)]
631
+ # self.act_control_regs[0].next <<= concat(self.act_start_in, self.act_func_in)
632
 
633
  self.accum_addr_regs[0].next <<= self.accum_addr_in
634
  self.accum_mode_regs[0].next <<= self.accum_mode_in
635
  for i in range(1, len(self.accum_addr_regs)):
636
  self.accum_addr_regs[i].next <<= self.accum_addr_regs[i - 1]
637
  self.accum_mode_regs[i].next <<= self.accum_mode_regs[i - 1]
638
+ # self.act_control_regs[i].next <<= self.act_control_regs[i - 1]
639
+ if i < len(self.act_start_regs):
640
+ self.act_enable_regs[i].next <<= self.act_enable_regs[i - 1]
641
+ self.act_start_regs[i].next <<= self.act_start_regs[i - 1]
642
 
643
  self.act_addr = Register(self.config.accum_addr_width)
644
  self.act_func = Register(1)
645
  self.act_start = Register(1)
646
 
647
  self.act_addr.next <<= self.accum_addr_out
648
+ # self.act_func.next <<= self.act_control_regs[-1][0]
649
+ # self.act_start.next <<= self.act_control_regs[-1][1]
650
+ self.act_func.next <<= self.act_enable_regs[-1]
651
+ self.act_start.next <<= self.act_start_regs[-1]
652
 
653
  def _connect_components(self):
654
  """Internal component connections"""
 
656
  self._create_pipeline_registers()
657
 
658
  # Connect buffer to external inputs
659
+ self.fifo.connect_inputs(
660
+ start=self.weight_start_in,
661
+ tile_addr=self.weight_tile_addr_in,
662
+ )
663
+
664
  self.systolic_array.connect_inputs(
665
  data_inputs=self.data_ins,
666
  enable_input=self.data_enable,
667
+ weight_inputs=self.fifo.outputs.weights,
668
+ weight_enable=self.fifo.outputs.active,
669
  )
670
 
671
  # Connect accumulator to systolic array
 
691
  self,
692
  data_enable: WireVector | None = None,
693
  data_inputs: list[WireVector] | None = None,
694
+ weight_start: WireVector | None = None,
695
+ weight_tile_addr: WireVector | None = None,
696
  accum_addr: WireVector | None = None,
697
  accum_mode: WireVector | None = None,
698
  act_start: WireVector | None = None,
 
707
  Args:
708
  data_enable: 1-bit signal that enables data flow into the systolic array
709
  data_inputs: List of input data wires for the systolic array. Must match array_size
710
+ weight_start: 1-bit signal that triggers loading of a new weight tile when pulsed high
711
+ weight_tile_addr: Address selecting which weight tile to load from the FIFO.
712
+ Width must match the FIFO's tile address width
713
  accum_addr: Address for the accumulator memory bank. Width must match accum_addr_width
714
  accum_mode: 1-bit mode select (0=overwrite, 1=accumulate with existing values)
715
  act_start: 1-bit signal to enable passing data through the activation unit
 
735
  )
736
  self.data_ins[i] <<= wire
737
 
738
+ if weight_start is not None:
739
+ assert len(weight_start) == 1, "Weight start signal must be 1 bit wide"
740
+ self.weight_start_in <<= weight_start
741
 
742
+ if weight_tile_addr is not None:
743
+ assert len(weight_tile_addr) == self.fifo.tile_addr_width, (
744
+ f"Weight tile address width mismatch. "
745
+ f"Expected {self.fifo.tile_addr_width}, got {len(weight_tile_addr)}"
746
  )
747
+ self.weight_tile_addr_in <<= weight_tile_addr
748
 
749
  if accum_addr is not None:
750
  assert len(accum_addr) == self.config.accum_addr_width, (
 
781
  assert len(valid) == 1, "Output valid signal must be a single bit wire"
782
  valid <<= self.activation.outputs_valid
783
 
784
+ def inspect_systolic_array_state(self, sim: Simulation):
785
+ """Return current PE array state"""
786
+ return self.systolic_array.get_state(sim)
787
+
788
+ def inspect_accumulator_state(self, sim: Simulation) -> np.ndarray:
789
+ """Return all accumulator tiles as 3D array.
790
+
791
+ Args:
792
+ sim: PyRTL simulation instance
793
+
794
+ Returns:
795
+ 2D numpy array of shape (2**accum_addr_width, array_size) containing
796
+ all accumulator tile data converted to floating point values.
797
+ Each tile contains array_size rows with array_size columns.
798
+ """
799
+ tiles = []
800
+ for addr in range(2**self.config.accum_addr_width):
801
+ row = [
802
+ float(self.config.accum_type(binint=sim.inspect_mem(bank).get(addr, 0)))
803
+ for bank in self.accumulator.memory_banks
804
+ ]
805
+ tiles.append(row)
806
+ return np.array(tiles)
807
+
808
+ def inspect_activation_state(self, sim: Simulation) -> ReluState:
809
+ """Return current activation unit state"""
810
+ return self.activation.inspect_state(sim)
811
+
812
 
813
  @dataclass
814
  class TiledAcceleratorConfig:
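
The new CompiledAcceleratorConfig/CompiledAccelerator pair in accelerator.py drops the on-chip weight FIFO and instead streams weights straight into the systolic array, with one activation dtype shared by the data path and the accumulator. A minimal usage sketch, not a definitive recipe: the field values and the expected name string below are illustrative, and it assumes BF16, Float8, and lmul_fast are importable as shown in the diff above.

    import pyrtl
    from hardware_accelerators.rtllib import (
        CompiledAcceleratorConfig,
        CompiledAccelerator,
        lmul_fast,
    )
    from hardware_accelerators.dtypes import Float8
    from hardware_accelerators.dtypes.bfloat16 import BF16

    pyrtl.reset_working_block()

    config = CompiledAcceleratorConfig(
        array_size=4,              # 4x4 systolic array
        activation_type=BF16,      # data path and accumulator dtype
        weight_type=Float8,        # weight dtype (bitwidth must not exceed activation_type)
        multiplier=lmul_fast,      # l-mul based PE multiplier
    )
    print(config.name)   # should read something like "w8ab16-4x4-lmul"
    print(config.id)     # short hex digest derived from the config hash

    accel = CompiledAccelerator(config)   # elaborates the design into the current PyRTL block
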
hardware_accelerators/rtllib/activations.py CHANGED
@@ -1,4 +1,6 @@
1
- from typing import Sequence, Type
 
 
2
  from pyrtl import (
3
  WireVector,
4
  Input,
@@ -13,6 +15,32 @@ from pyrtl import (
13
  from ..dtypes.base import BaseFloat
14
 
15
 
  class ReluUnit:
17
  def __init__(self, size: int, dtype: Type[BaseFloat]):
18
  self.size = size
@@ -21,18 +49,24 @@ class ReluUnit:
21
  # Control signals
22
  self.start = WireVector(1) # trigger to latch new enable value
23
  self.enable_in = WireVector(1) # input enable value to latch
24
- self.inputs_valid = WireVector(1) # indicates if inputs are valid
25
  self.enable_reg = Register(1) # stateful enable register
 
 
 
26
 
27
  # Input and output data
28
- self.data = [WireVector(dtype.bitwidth()) for _ in range(size)]
29
- self.outputs = [self.relu(x) for x in self.data]
30
  self.outputs_valid = WireVector(1)
31
- self.outputs_valid <<= self.inputs_valid
32
 
33
  def relu(self, x: WireVector):
34
  # Use enable_reg instead of enable wire
35
- pass_condition = self.inputs_valid & (
36
  ~self.enable_reg | (self.enable_reg & ~x[-1])
37
  )
38
  return select(pass_condition, x, Const(0, self.dtype.bitwidth()))
@@ -48,7 +82,7 @@ class ReluUnit:
48
  len(inputs) == self.size
49
  ), f"Activation module input size mismatch. Expected {self.size}, got {len(inputs)}"
50
  for i in range(self.size):
51
- self.data[i] <<= inputs[i]
52
  self.inputs_valid <<= valid
53
  self.enable_in <<= enable
54
  self.start <<= start
@@ -85,6 +119,34 @@ class ReluUnit:
85
  """
86
  return [float(self.dtype(binint=sim.inspect(out.name))) for out in self.outputs]
87
 
88
 
89
  # class ReluUnit:
90
  # def __init__(self, size: int, dtype: Type[BaseFloat]):
 
1
+ from dataclasses import dataclass
2
+ import numpy as np
3
+ from typing import TYPE_CHECKING, Sequence, Type
4
  from pyrtl import (
5
  WireVector,
6
  Input,
 
15
  from ..dtypes.base import BaseFloat
16
 
17
 
18
+ @dataclass
19
+ class ReluState:
20
+ start: int
21
+ enable_in: int
22
+ enable_reg: int
23
+ inputs_valid: int
24
+ inputs: np.ndarray
25
+ registers: np.ndarray
26
+ outputs_valid: int
27
+ outputs: np.ndarray
28
+
29
+ def __repr__(self) -> str:
30
+ """Pretty print the ReLU state"""
31
+ status = "enabled" if self.enable_reg else "disabled"
32
+ valid_str = "(valid)" if self.outputs_valid else "(invalid)"
33
+
34
+ return (
35
+ f"ReLU {status} {valid_str}\n"
36
+ f" Control: start={self.start}, enable_in={self.enable_in}, "
37
+ f"enable_reg={self.enable_reg}, inputs_valid={self.inputs_valid}\n"
38
+ f" Inputs: {np.array2string(self.inputs, precision=4, suppress_small=True)}\n"
39
+ f" Registers: {np.array2string(self.registers, precision=4, suppress_small=True)}\n"
40
+ f" Outputs: {np.array2string(self.outputs, precision=4, suppress_small=True)}"
41
+ )
42
+
43
+
44
  class ReluUnit:
45
  def __init__(self, size: int, dtype: Type[BaseFloat]):
46
  self.size = size
 
49
  # Control signals
50
  self.start = WireVector(1) # trigger to latch new enable value
51
  self.enable_in = WireVector(1) # input enable value to latch
 
52
  self.enable_reg = Register(1) # stateful enable register
53
+ self.inputs_valid = WireVector(1) # indicates if inputs are valid
54
+ self.valid_reg = Register(1) # stateful valid register
55
+ self.valid_reg.next <<= self.inputs_valid
56
 
57
  # Input and output data
58
+ self.data_in = [WireVector(dtype.bitwidth()) for _ in range(size)]
59
+ self.data_regs = [Register(dtype.bitwidth()) for _ in range(size)]
60
+ for data, reg in zip(self.data_in, self.data_regs):
61
+ reg.next <<= data
62
+
63
+ self.outputs = [self.relu(x) for x in self.data_regs]
64
  self.outputs_valid = WireVector(1)
65
+ self.outputs_valid <<= self.valid_reg
66
 
67
  def relu(self, x: WireVector):
68
  # Use enable_reg instead of enable wire
69
+ pass_condition = self.valid_reg & (
70
  ~self.enable_reg | (self.enable_reg & ~x[-1])
71
  )
72
  return select(pass_condition, x, Const(0, self.dtype.bitwidth()))
 
82
  len(inputs) == self.size
83
  ), f"Activation module input size mismatch. Expected {self.size}, got {len(inputs)}"
84
  for i in range(self.size):
85
+ self.data_in[i] <<= inputs[i]
86
  self.inputs_valid <<= valid
87
  self.enable_in <<= enable
88
  self.start <<= start
 
119
  """
120
  return [float(self.dtype(binint=sim.inspect(out.name))) for out in self.outputs]
121
 
122
+ def inspect_state(self, sim: Simulation) -> ReluState:
123
+ """Inspect current state of the ReLU unit."""
124
+ return ReluState(
125
+ start=sim.inspect(self.start.name),
126
+ enable_in=sim.inspect(self.enable_in.name),
127
+ enable_reg=sim.inspect(self.enable_reg.name),
128
+ inputs_valid=sim.inspect(self.inputs_valid.name),
129
+ inputs=np.array(
130
+ [
131
+ float(self.dtype(binint=sim.inspect(inp.name)))
132
+ for inp in self.data_in
133
+ ]
134
+ ),
135
+ registers=np.array(
136
+ [
137
+ float(self.dtype(binint=sim.inspect(reg.name)))
138
+ for reg in self.data_regs
139
+ ]
140
+ ),
141
+ outputs_valid=sim.inspect(self.outputs_valid.name),
142
+ outputs=np.array(
143
+ [
144
+ float(self.dtype(binint=sim.inspect(out.name)))
145
+ for out in self.outputs
146
+ ]
147
+ ),
148
+ )
149
+
150
 
151
  # class ReluUnit:
152
  # def __init__(self, size: int, dtype: Type[BaseFloat]):
hardware_accelerators/rtllib/adders.py CHANGED
@@ -17,6 +17,7 @@ def float_adder(
17
  float_a: WireVector,
18
  float_b: WireVector,
19
  dtype: Type[BaseFloat],
 
20
  ) -> WireVector:
21
 
22
  e_bits, m_bits = dtype.exponent_bits(), dtype.mantissa_bits()
@@ -26,7 +27,7 @@ def float_adder(
26
  )
27
 
28
  sign_xor, exp_larger, signed_shift, mant_smaller, mant_larger = adder_stage_2(
29
- sign_a, sign_b, exp_a, exp_b, mantissa_a, mantissa_b, e_bits, m_bits
30
  )
31
 
32
  abs_shift = WireVector(e_bits) # , "abs_shift")
@@ -37,7 +38,7 @@ def float_adder(
37
  )
38
 
39
  mantissa_sum, is_neg, lzc = adder_stage_4(
40
- aligned_mant_msb, mant_larger, sign_xor, m_bits
41
  )
42
 
43
  final_sign, final_exp, norm_mantissa = adder_stage_5(
@@ -56,13 +57,52 @@ def float_adder(
56
  )
57
 
58
  float_result = WireVector(dtype.bitwidth()) # , "float_result")
59
- float_result <<= pyrtl.concat(final_sign, final_exp, norm_mantissa)
  return float_result
61
 
62
 
 
63
  ### ===================================================================
64
  ### Simple Pipeline Design
65
  ### ===================================================================
 
67
 
68
  class FloatAdderPipelined(SimplePipeline):
@@ -72,6 +112,7 @@ class FloatAdderPipelined(SimplePipeline):
72
  float_b: WireVector,
73
  w_en: WireVector,
74
  dtype: Type[BaseFloat],
 
75
  ):
76
  """
77
  Initialize a pipelined BFloat16 adder with write enable control.
@@ -134,17 +175,17 @@ class FloatAdderPipelined(SimplePipeline):
134
  write enable is not 1 bit
135
  """
136
  assert (
137
- len(float_a) == len(float_b) == 16
138
  ), f"float inputs must be {dtype.bitwidth()} bits"
139
  assert len(w_en) == 1, "write enable signal must be 1 bit"
140
-
141
  self.e_bits, self.m_bits = dtype.exponent_bits(), dtype.mantissa_bits()
142
  # Define inputs and outputs
143
  self._float_a, self._float_b = float_a, float_b
144
  self._write_enable = w_en
145
- # self._result = pyrtl.Register(self.e_bits + self.m_bits + 1, 'result')
146
  self._result_out = pyrtl.WireVector(dtype.bitwidth()) # , "_result")
147
- super(FloatAdderPipelined, self).__init__()
148
 
149
  @property
150
  def result(self):
@@ -183,6 +224,7 @@ class FloatAdderPipelined(SimplePipeline):
183
  self.mant_b,
184
  self.e_bits,
185
  self.m_bits,
 
186
  )
187
 
188
  def stage2(self):
@@ -219,7 +261,11 @@ class FloatAdderPipelined(SimplePipeline):
219
 
220
  # Perform mantissa addition and leading zero detection
221
  self.mant_sum, self.is_neg, self.lzc = adder_stage_4(
222
- self.aligned_mant_msb, self.mant_larger, self.sign_xor, self.m_bits
  )
224
 
225
  def stage4(self):
 
17
  float_a: WireVector,
18
  float_b: WireVector,
19
  dtype: Type[BaseFloat],
20
+ fast: bool = False,
21
  ) -> WireVector:
22
 
23
  e_bits, m_bits = dtype.exponent_bits(), dtype.mantissa_bits()
 
27
  )
28
 
29
  sign_xor, exp_larger, signed_shift, mant_smaller, mant_larger = adder_stage_2(
30
+ sign_a, sign_b, exp_a, exp_b, mantissa_a, mantissa_b, e_bits, m_bits, fast
31
  )
32
 
33
  abs_shift = WireVector(e_bits) # , "abs_shift")
 
38
  )
39
 
40
  mantissa_sum, is_neg, lzc = adder_stage_4(
41
+ aligned_mant_msb, mant_larger, sign_xor, m_bits, fast
42
  )
43
 
44
  final_sign, final_exp, norm_mantissa = adder_stage_5(
 
57
  )
58
 
59
  float_result = WireVector(dtype.bitwidth()) # , "float_result")
60
+
61
+ # Zero detection logic
62
+ a_is_zero = ~pyrtl.or_all_bits(float_a[:-1])
63
+ b_is_zero = ~pyrtl.or_all_bits(float_b[:-1])
64
+
65
+ with pyrtl.conditional_assignment:
66
+ with a_is_zero:
67
+ float_result |= float_b
68
+ with b_is_zero:
69
+ float_result |= float_a
70
+ with pyrtl.otherwise:
71
+ float_result |= pyrtl.concat(final_sign, final_exp, norm_mantissa)
72
+
73
  return float_result
74
 
75
 
76
+ def float_adder_fast_unstable(
77
+ float_a: WireVector, float_b: WireVector, dtype: Type[BaseFloat]
78
+ ) -> WireVector:
79
+ return float_adder(float_a, float_b, dtype, fast=True)
80
+
81
+
82
  ### ===================================================================
83
  ### Simple Pipeline Design
84
  ### ===================================================================
85
+ # TODO: add zero detection logic
86
+
87
+
88
+ def float_adder_pipelined(
89
+ float_a: WireVector, float_b: WireVector, dtype: Type[BaseFloat], fast: bool = False
90
+ ) -> WireVector:
91
+ w_en = pyrtl.Input(1)
92
+ w_en.name = w_en.name.replace("tmp", "adder_w_en_in")
93
+ adder = FloatAdderPipelined(float_a, float_b, w_en, dtype, fast=fast)
94
+ return adder._result_out
95
+
96
+
97
+ def float_adder_pipelined_fast_unstable(
98
+ float_a: WireVector,
99
+ float_b: WireVector,
100
+ dtype: Type[BaseFloat],
101
+ ) -> WireVector:
102
+ w_en = pyrtl.Input(1)
103
+ w_en.name = w_en.name.replace("tmp", "adder_w_en_in")
104
+ adder = FloatAdderPipelined(float_a, float_b, w_en, dtype, fast=True)
105
+ return adder._result_out
106
 
107
 
108
  class FloatAdderPipelined(SimplePipeline):
 
112
  float_b: WireVector,
113
  w_en: WireVector,
114
  dtype: Type[BaseFloat],
115
+ fast: bool = False,
116
  ):
117
  """
118
  Initialize a pipelined BFloat16 adder with write enable control.
 
175
  write enable is not 1 bit
176
  """
177
  assert (
178
+ len(float_a) == len(float_b) == dtype.bitwidth()
179
  ), f"float inputs must be {dtype.bitwidth()} bits"
180
  assert len(w_en) == 1, "write enable signal must be 1 bit"
181
+ self._fast = fast
182
  self.e_bits, self.m_bits = dtype.exponent_bits(), dtype.mantissa_bits()
183
  # Define inputs and outputs
184
  self._float_a, self._float_b = float_a, float_b
185
  self._write_enable = w_en
186
+ # self._result = pyrtl.Register(self.e_bits + self.m_bits + 1, "result")
187
  self._result_out = pyrtl.WireVector(dtype.bitwidth()) # , "_result")
188
+ super().__init__("float_adder")
189
 
190
  @property
191
  def result(self):
 
224
  self.mant_b,
225
  self.e_bits,
226
  self.m_bits,
227
+ self._fast,
228
  )
229
 
230
  def stage2(self):
 
261
 
262
  # Perform mantissa addition and leading zero detection
263
  self.mant_sum, self.is_neg, self.lzc = adder_stage_4(
264
+ self.aligned_mant_msb,
265
+ self.mant_larger,
266
+ self.sign_xor,
267
+ self.m_bits,
268
+ self._fast,
269
  )
270
 
271
  def stage4(self):
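
The float_adder changes add a fast flag (threaded through adder_stage_2 and adder_stage_4) plus explicit zero detection, so adding 0.0 now returns the other operand unchanged. A combinational sketch that exercises both behaviors; the hex constants are standard BF16 bit patterns and the expected values assume the adder rounds as intended.

    import pyrtl
    from hardware_accelerators.rtllib.adders import float_adder
    from hardware_accelerators.dtypes.bfloat16 import BF16

    pyrtl.reset_working_block()
    a = pyrtl.Input(BF16.bitwidth(), "a")
    b = pyrtl.Input(BF16.bitwidth(), "b")
    s = pyrtl.Output(BF16.bitwidth(), "s")
    s <<= float_adder(a, b, BF16)         # pass fast=True to use the faster rtllib internals instead

    sim = pyrtl.Simulation()
    sim.step({"a": 0x3F80, "b": 0x4000})  # 1.0 + 2.0 -> expect 0x4040 (3.0)
    print(hex(sim.inspect("s")))
    sim.step({"a": 0x0000, "b": 0x4000})  # 0.0 + 2.0 -> new zero-detection path returns b (0x4000)
    print(hex(sim.inspect("s")))
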
hardware_accelerators/rtllib/legacy.py CHANGED
@@ -1,11 +1,82 @@
 
1
  import pyrtl
 
2
  from pyrtl.rtllib.adders import carrysave_adder, kogge_stone
3
 
 
 
4
  from .utils.lmul_utils import get_combined_offset
5
 
 
6
  ###########################
7
  # Old code below
8
  ###########################
9
 
10
 
11
  # BF16 Naive Combinatorial
 
1
+ from typing import Type
2
  import pyrtl
3
+ from pyrtl import WireVector
4
  from pyrtl.rtllib.adders import carrysave_adder, kogge_stone
5
 
6
+ from ..dtypes.base import BaseFloat
7
+
8
  from .utils.lmul_utils import get_combined_offset
9
 
10
+
11
  ###########################
12
  # Old code below
13
  ###########################
14
+ def lmul_simple(
15
+ float_a: WireVector,
16
+ float_b: WireVector,
17
+ dtype: Type[BaseFloat],
18
+ ):
19
+ """Linear time complexity float multiply unit in the simplest configuration."""
20
+ e_bits, m_bits = dtype.exponent_bits(), dtype.mantissa_bits()
21
+ em_bits = e_bits + m_bits
22
+ sign_out = float_a[em_bits] ^ float_b[em_bits]
23
+
24
+ unsigned_offset = pyrtl.Const(get_combined_offset(e_bits, m_bits), em_bits)
25
+ result_sum = float_a[:em_bits] + float_b[:em_bits] - unsigned_offset
26
+
27
+ fp_out = WireVector(bitwidth=em_bits + 1)
28
+ fp_out <<= pyrtl.concat(sign_out, pyrtl.truncate(result_sum, em_bits))
29
+ return fp_out
30
+
31
+
32
+ def lmul_fast(float_a: WireVector, float_b: WireVector, dtype: Type[BaseFloat]):
33
+ e_bits, m_bits = dtype.exponent_bits(), dtype.mantissa_bits()
34
+ em_bits = e_bits + m_bits
35
+ sign_a = float_a[em_bits]
36
+ sign_b = float_b[em_bits]
37
+ exp_mantissa_a = float_a[:em_bits]
38
+ exp_mantissa_b = float_b[:em_bits]
39
+ fp_out = WireVector(em_bits + 1)
40
+
41
+ # Calculate result sign
42
+ result_sign = sign_a ^ sign_b
43
+
44
+ # Add exp_mantissa parts using kogge_stone adder (faster than ripple)
45
+ # exp_mantissa_sum = kogge_stone(exp_mantissa_a, exp_mantissa_b)
46
+
47
+ # Get the combined offset-bias constant
48
+ OFFSET_MINUS_BIAS = pyrtl.Const(
49
+ get_combined_offset(e_bits, m_bits, True), bitwidth=em_bits
50
+ )
51
+
52
+ # Add offset-bias value - this will be 8 bits including carry
53
+ # final_sum = kogge_stone(exp_mantissa_sum, OFFSET_MINUS_BIAS)
54
+
55
+ final_sum = carrysave_adder(
56
+ exp_mantissa_a, exp_mantissa_b, OFFSET_MINUS_BIAS, final_adder=kogge_stone
57
+ )
58
+
59
+ # Select result based on carry and MSB:
60
+ # carry=1: overflow -> 0x7F
61
+ # carry=0, msb=0: underflow -> 0x00
62
+ # carry=0, msb=1: normal -> result_bits
63
+
64
+ MAX_VALUE = pyrtl.Const(2**em_bits - 1, bitwidth=em_bits) # , name="max_value")
65
+
66
+ if e_bits == 4 and m_bits == 3:
67
+ MAX_VALUE = pyrtl.Const(0x7F, 7)
68
+
69
+ mantissa_result = pyrtl.mux(
70
+ final_sum[em_bits:],
71
+ pyrtl.Const(0, bitwidth=em_bits),
72
+ final_sum[:em_bits],
73
+ default=MAX_VALUE,
74
+ )
75
+
76
+ # Combine sign and result
77
+ fp_out <<= pyrtl.concat(result_sign, mantissa_result)
78
+
79
+ return fp_out
80
 
81
 
82
  # BF16 Naive Combinatorial
hardware_accelerators/rtllib/lmul.py CHANGED
@@ -1,92 +1,92 @@
1
  from typing import Type
2
 
3
  import pyrtl
4
- from pyrtl import WireVector
5
- from pyrtl.rtllib.adders import carrysave_adder, kogge_stone
6
 
7
  from ..dtypes import BaseFloat, Float8
8
- from .utils.lmul_utils import get_combined_offset
9
 
10
 
11
- def lmul_simple(
12
- float_a: WireVector,
13
- float_b: WireVector,
14
- dtype: Type[BaseFloat],
15
- ):
16
- """Linear time complexity float multiply unit in the simplest configuration."""
17
- e_bits, m_bits = dtype.exponent_bits(), dtype.mantissa_bits()
18
- em_bits = e_bits + m_bits
19
- sign_out = float_a[em_bits] ^ float_b[em_bits]
20
-
21
- unsigned_offset = pyrtl.Const(get_combined_offset(e_bits, m_bits), em_bits)
22
- result_sum = float_a[:em_bits] + float_b[:em_bits] - unsigned_offset
23
-
24
- fp_out = WireVector(bitwidth=em_bits + 1)
25
- fp_out <<= pyrtl.concat(sign_out, pyrtl.truncate(result_sum, em_bits))
26
- return fp_out
27
-
28
-
29
- def lmul_fast(float_a: WireVector, float_b: WireVector, dtype: Type[BaseFloat]):
30
  e_bits, m_bits = dtype.exponent_bits(), dtype.mantissa_bits()
31
  em_bits = e_bits + m_bits
32
  sign_a = float_a[em_bits]
33
  sign_b = float_b[em_bits]
 
 
34
  exp_mantissa_a = float_a[:em_bits]
35
  exp_mantissa_b = float_b[:em_bits]
36
- fp_out = WireVector(em_bits + 1)
37
 
38
- # Calculate result sign
39
- result_sign = sign_a ^ sign_b
 
 
40
 
41
- # Add exp_mantissa parts using kogge_stone adder (faster than ripple)
42
- # exp_mantissa_sum = kogge_stone(exp_mantissa_a, exp_mantissa_b)
43
 
44
- # Get the combined offset-bias constant
45
- OFFSET_MINUS_BIAS = pyrtl.Const(
46
- get_combined_offset(e_bits, m_bits, True), bitwidth=em_bits
47
- )
48
 
49
- # Add offset-bias value - this will be 8 bits including carry
50
- # final_sum = kogge_stone(exp_mantissa_sum, OFFSET_MINUS_BIAS)
51
 
52
- final_sum = carrysave_adder(
53
- exp_mantissa_a, exp_mantissa_b, OFFSET_MINUS_BIAS, final_adder=kogge_stone
54
- )
55
 
56
- # Select result based on carry and MSB:
57
- # carry=1: overflow -> 0x7F
58
- # carry=0, msb=0: underflow -> 0x00
59
- # carry=0, msb=1: normal -> result_bits
60
 
61
- MAX_VALUE = pyrtl.Const(2**em_bits - 1, bitwidth=em_bits) # , name="max_value")
62
 
63
- if e_bits == 4 and m_bits == 3:
64
- MAX_VALUE = pyrtl.Const(0x7F, 7)
65
 
66
- mantissa_result = pyrtl.mux(
67
- final_sum[em_bits:],
68
- pyrtl.Const(0, bitwidth=em_bits),
69
- final_sum[:em_bits],
70
- default=MAX_VALUE,
71
- )
72
 
73
- # Combine sign and result
74
- fp_out <<= pyrtl.concat(result_sign, mantissa_result)
75
 
76
- return fp_out
77
 
78
 
79
- # Float8 fast pipelined lmul
80
  class LmulPipelined:
81
  def __init__(
82
  self,
83
  float_a: WireVector,
84
  float_b: WireVector,
85
  dtype: Type[BaseFloat],
 
86
  ):
87
  self.e_bits = dtype.exponent_bits()
88
  self.m_bits = dtype.mantissa_bits()
89
  self.em_bits = dtype.bitwidth() - 1
 
90
 
91
  # Inputs and Outputs
92
  assert (
@@ -137,13 +137,16 @@ class LmulPipelined:
137
  # Calculate and register sign
138
  self.reg_sign.next <<= sign_a ^ sign_b
139
 
140
- # First addition and register result
141
- final_sum = carrysave_adder(
142
- exp_mantissa_a,
143
- exp_mantissa_b,
144
- self.OFFSET_MINUS_BIAS,
145
- final_adder=kogge_stone,
146
- )
 
 
 
147
 
148
  self.reg_final_sum.next <<= final_sum
149
 
 
1
  from typing import Type
2
 
3
  import pyrtl
4
+ from pyrtl import WireVector, conditional_assignment
5
+ from pyrtl.rtllib.adders import carrysave_adder, kogge_stone, fast_group_adder
6
 
7
  from ..dtypes import BaseFloat, Float8
8
+ from .utils.lmul_utils import get_combined_offset, lmul_offset_rtl
9
 
10
 
11
+ def lmul(float_a: WireVector, float_b: WireVector, dtype: Type[BaseFloat], fast=False):
12
  e_bits, m_bits = dtype.exponent_bits(), dtype.mantissa_bits()
13
  em_bits = e_bits + m_bits
14
  sign_a = float_a[em_bits]
15
  sign_b = float_b[em_bits]
16
+ exp_a = float_a[m_bits:-1]
17
+ exp_b = float_b[m_bits:-1]
18
  exp_mantissa_a = float_a[:em_bits]
19
  exp_mantissa_b = float_b[:em_bits]
 
20
 
21
+ zero_or_subnormal = WireVector(1)
22
+ final_sum = WireVector(em_bits + 2)
23
+ carry_msb = WireVector(2)
24
+ fp_out = WireVector(dtype.bitwidth())
25
 
26
+ OFFSET_MINUS_BIAS = lmul_offset_rtl(dtype)
27
+ MAX_VALUE = pyrtl.Const(dtype.binary_max(), bitwidth=em_bits)
28
 
29
+ if fast:
30
+ final_sum <<= carrysave_adder(
31
+ exp_mantissa_a, exp_mantissa_b, OFFSET_MINUS_BIAS, final_adder=kogge_stone
32
+ )
33
+ else:
34
+ final_sum <<= exp_mantissa_a + exp_mantissa_b + OFFSET_MINUS_BIAS
35
+
36
+ carry_msb <<= final_sum[em_bits:]
37
+ zero_or_subnormal <<= ~pyrtl.or_all_bits(exp_a) | ~pyrtl.or_all_bits(exp_b)
38
+
39
+ with conditional_assignment:
40
+ with zero_or_subnormal:
41
+ fp_out |= 0
42
+ with carry_msb == 0:
43
+ fp_out |= 0
44
+ with carry_msb == 1:
45
+ fp_out |= pyrtl.concat(sign_a ^ sign_b, final_sum[:em_bits])
46
+ with pyrtl.otherwise:
47
+ fp_out |= pyrtl.concat(sign_a ^ sign_b, MAX_VALUE)
48
 
49
+ return fp_out
 
50
 
 
 
 
51
 
52
+ def lmul_simple(float_a: WireVector, float_b: WireVector, dtype: Type[BaseFloat]):
53
+ return lmul(float_a, float_b, dtype, fast=False)
 
 
54
 
 
55
 
56
+ def lmul_fast(float_a: WireVector, float_b: WireVector, dtype: Type[BaseFloat]):
57
+ return lmul(float_a, float_b, dtype, fast=True)
58
 
59
 
60
+ def lmul_pipelined(
61
+ float_a: WireVector,
62
+ float_b: WireVector,
63
+ dtype: Type[BaseFloat],
64
+ ) -> WireVector:
65
+ mult = LmulPipelined(float_a, float_b, dtype)
66
+ return mult.output_reg
67
 
68
+
69
+ def lmul_pipelined_fast(
70
+ float_a: WireVector,
71
+ float_b: WireVector,
72
+ dtype: Type[BaseFloat],
73
+ ) -> WireVector:
74
+ mult = LmulPipelined(float_a, float_b, dtype, fast=True)
75
+ return mult.output_reg
76
 
77
 
 
78
  class LmulPipelined:
79
  def __init__(
80
  self,
81
  float_a: WireVector,
82
  float_b: WireVector,
83
  dtype: Type[BaseFloat],
84
+ fast: bool = False,
85
  ):
86
  self.e_bits = dtype.exponent_bits()
87
  self.m_bits = dtype.mantissa_bits()
88
  self.em_bits = dtype.bitwidth() - 1
89
+ self._fast = fast
90
 
91
  # Inputs and Outputs
92
  assert (
 
137
  # Calculate and register sign
138
  self.reg_sign.next <<= sign_a ^ sign_b
139
 
140
+ # Add the floating point numbers with special lmul offset
141
+ if self._fast:
142
+ final_sum = carrysave_adder(
143
+ exp_mantissa_a,
144
+ exp_mantissa_b,
145
+ self.OFFSET_MINUS_BIAS,
146
+ final_adder=kogge_stone,
147
+ )
148
+ else:
149
+ final_sum = exp_mantissa_a + exp_mantissa_b + self.OFFSET_MINUS_BIAS
150
 
151
  self.reg_final_sum.next <<= final_sum
152
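
lmul.py now exposes a single lmul builder with a fast flag (plain ripple add versus carry-save plus Kogge-Stone), thin lmul_simple / lmul_fast / lmul_pipelined wrappers, and a flush-to-zero path for zero or subnormal operands. A small sketch comparing the two flavors on one BF16 product; the bit patterns are standard BF16 encodings and no exact output is asserted, since l-mul approximates the mantissa product with a single add.

    import pyrtl
    from hardware_accelerators.rtllib.lmul import lmul_simple, lmul_fast
    from hardware_accelerators.dtypes.bfloat16 import BF16

    pyrtl.reset_working_block()
    a = pyrtl.Input(BF16.bitwidth(), "a")
    b = pyrtl.Input(BF16.bitwidth(), "b")
    p_simple = pyrtl.Output(BF16.bitwidth(), "p_simple")
    p_fast = pyrtl.Output(BF16.bitwidth(), "p_fast")
    p_simple <<= lmul_simple(a, b, BF16)   # plain ripple-carry sum
    p_fast <<= lmul_fast(a, b, BF16)       # carry-save adder with Kogge-Stone final stage

    sim = pyrtl.Simulation()
    sim.step({"a": 0x3FC0, "b": 0x4000})   # 1.5 * 2.0; exact product is 3.0 (0x4040)
    print(hex(sim.inspect("p_simple")), hex(sim.inspect("p_fast")))  # both should land near 0x4040
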