|
|
from typing import Dict, Any, Optional
|
|
|
import numpy as np
|
|
|
|
|
|
class MemoryBlock:
    """Base class for GPU memory blocks.

    Wraps a fixed-size, zero-initialised ``bytearray`` and hands out
    consecutive offsets through a bump allocator (``allocate`` never frees).
    """

    def __init__(self, size_bytes: int):
        """Create a block backed by ``size_bytes`` zeroed bytes."""
        self.size = size_bytes
        self.data = bytearray(size_bytes)
        # Next free offset that ``allocate`` will hand out.
        self.offset = 0

    def allocate(self, size_bytes: int) -> Optional[int]:
        """Reserve ``size_bytes`` bytes and return their starting offset.

        Returns:
            The offset of the reserved region, or ``None`` when the block
            does not have ``size_bytes`` free bytes left.

        Raises:
            ValueError: If ``size_bytes`` is negative.
        """
        # A negative size would move the bump pointer backwards and let
        # later allocations overlap earlier ones.
        if size_bytes < 0:
            raise ValueError("Allocation size must be non-negative")
        if self.offset + size_bytes > self.size:
            return None
        current_offset = self.offset
        self.offset += size_bytes
        return current_offset

    def write(self, offset: int, data: bytes) -> None:
        """Copy ``data`` into the block starting at ``offset``.

        Raises:
            ValueError: If ``offset`` is negative or the write would run
                past the end of the block.
        """
        # Negative offsets are interpreted relative to the end by slicing
        # and can make the slice assignment resize the bytearray, so they
        # are rejected explicitly.
        if offset < 0:
            raise ValueError("Write offset must be non-negative")
        end = offset + len(data)
        if end > self.size:
            raise ValueError("Write operation exceeds memory block size")
        self.data[offset:end] = data

    def read(self, offset: int, size: int) -> bytes:
        """Return ``size`` bytes starting at ``offset`` as an immutable copy.

        Raises:
            ValueError: If ``offset`` or ``size`` is negative, or the read
                would run past the end of the block.
        """
        # Without this check a negative offset/size would slice from the
        # end of the buffer and return the wrong bytes (or fewer bytes).
        if offset < 0 or size < 0:
            raise ValueError("Read offset and size must be non-negative")
        end = offset + size
        if end > self.size:
            raise ValueError("Read operation exceeds memory block size")
        return bytes(self.data[offset:end])
|
|
|
|
|
|
class SharedMemory(MemoryBlock):
    """Represents shared memory accessible by all threads in a block."""

    def __init__(self, size_bytes: int = 48 * 1024):
        """Create the shared-memory block (default size 48 KiB)."""
        super().__init__(size_bytes)
        # Lock flags keyed by int; no method of this class consults them.
        self.locks: Dict[int, bool] = {}

    def atomic_add(self, offset: int, value: int) -> int:
        """Add ``value`` to the 32-bit little-endian word at ``offset``.

        The stored word is treated as unsigned and the sum wraps modulo
        2**32, so negative deltas and overflowing sums are stored instead
        of raising ``OverflowError``. This method performs a plain
        read-modify-write and takes no lock itself.

        Returns:
            The unsigned value held at ``offset`` before the addition.

        Raises:
            ValueError: Propagated from ``read``/``write`` when the 4-byte
                word does not fit inside the block.
        """
        current = int.from_bytes(self.read(offset, 4), 'little')
        # Mask to 32 bits: to_bytes(4, ...) rejects values outside
        # [0, 2**32), which previously crashed on wrap-around.
        new_value = (current + value) & 0xFFFFFFFF
        self.write(offset, new_value.to_bytes(4, 'little'))
        return current
|
|
|
|
|
|
class L1Cache(MemoryBlock):
    """L1 cache: a memory block plus a table of fixed-size cache lines."""

    def __init__(self, size_bytes: int = 32*1024):
        """Create the cache (default 32 KiB) with 128-byte lines."""
        super().__init__(size_bytes)
        self.line_size = 128
        # Lines loaded so far, keyed by their line-aligned start address.
        self.cache_lines: Dict[int, bytes] = {}

    def load_line(self, address: int) -> bytes:
        """Return the cache line that contains ``address``.

        The address is rounded down to a multiple of ``line_size``. A line
        that is not yet present is created zero-filled, stored, and
        returned.
        """
        offset_in_line = address % self.line_size
        base = address - offset_in_line
        try:
            return self.cache_lines[base]
        except KeyError:
            # First touch of this line: materialise it as zeros.
            fresh = bytes(self.line_size)
            self.cache_lines[base] = fresh
            return fresh
|
|
|
|
|
|
class L2Cache(MemoryBlock):
    """L2 cache: a memory block plus a table of fixed-size cache lines."""

    def __init__(self, size_bytes: int = 1024*1024):
        """Create the cache (default 1 MiB) with 256-byte lines."""
        super().__init__(size_bytes)
        self.line_size = 256
        # Lines loaded so far, keyed by their line-aligned start address.
        self.cache_lines: Dict[int, bytes] = {}

    def load_line(self, address: int) -> bytes:
        """Return the cache line that contains ``address``.

        The address is rounded down to a multiple of ``line_size``. A line
        that is not yet present is created zero-filled, stored, and
        returned.
        """
        offset_in_line = address % self.line_size
        base = address - offset_in_line
        try:
            return self.cache_lines[base]
        except KeyError:
            # First touch of this line: materialise it as zeros.
            fresh = bytes(self.line_size)
            self.cache_lines[base] = fresh
            return fresh
|
|
|
|
|
|
class RegisterFile:
    """Represents per-thread registers.

    Registers are handed out sequentially by ``allocate``; only indices
    below the number allocated so far may be read or written.
    """

    def __init__(self, num_registers: int = 255):
        """Create ``num_registers`` registers, all initialised to 0."""
        self.registers = [0] * num_registers
        # Count of registers handed out so far; also the next free index.
        self.used_registers = 0

    def allocate(self, num: int = 1) -> Optional[int]:
        """Allocate ``num`` consecutive registers and return the first index.

        Returns:
            The starting index, or ``None`` when fewer than ``num``
            registers remain.

        Raises:
            ValueError: If ``num`` is negative.
        """
        # A negative count would decrement ``used_registers`` and make
        # already-allocated registers unreadable or double-allocated.
        if num < 0:
            raise ValueError("Number of registers must be non-negative")
        if self.used_registers + num > len(self.registers):
            return None
        start = self.used_registers
        self.used_registers += num
        return start

    def read(self, index: int) -> int:
        """Return the value of the allocated register at ``index``.

        Raises:
            IndexError: If ``index`` is negative or not yet allocated.
        """
        if 0 <= index < self.used_registers:
            return self.registers[index]
        raise IndexError("Register index out of bounds")

    def write(self, index: int, value: int) -> None:
        """Store ``value`` in the allocated register at ``index``.

        Raises:
            IndexError: If ``index`` is negative or not yet allocated.
        """
        if not 0 <= index < self.used_registers:
            raise IndexError("Register index out of bounds")
        self.registers[index] = value
|
|
|
|