rcgalbo committed on
Commit bf20cb7 · 0 Parent(s)

Deploy wayyDB to HuggingFace Spaces

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50)
  1. .gitignore +32 -0
  2. CMakeLists.txt +123 -0
  3. Dockerfile +30 -0
  4. README.md +357 -0
  5. api/__pycache__/main.cpython-310.pyc +0 -0
  6. api/__pycache__/pubsub.cpython-310.pyc +0 -0
  7. api/__pycache__/streaming.cpython-310.pyc +0 -0
  8. api/kvstore.py +150 -0
  9. api/main.py +1031 -0
  10. api/pubsub.py +547 -0
  11. api/requirements.txt +6 -0
  12. api/streaming.py +553 -0
  13. build/_deps/googletest-src +1 -0
  14. build/_deps/pybind11-src +1 -0
  15. dist/wayy_db-0.1.0-cp310-cp310-linux_x86_64.whl +0 -0
  16. include/wayy_db/column.hpp +135 -0
  17. include/wayy_db/column_view.hpp +93 -0
  18. include/wayy_db/database.hpp +87 -0
  19. include/wayy_db/hash_index.hpp +46 -0
  20. include/wayy_db/mmap_file.hpp +67 -0
  21. include/wayy_db/ops/aggregations.hpp +69 -0
  22. include/wayy_db/ops/joins.hpp +48 -0
  23. include/wayy_db/ops/window.hpp +54 -0
  24. include/wayy_db/string_column.hpp +79 -0
  25. include/wayy_db/table.hpp +133 -0
  26. include/wayy_db/types.hpp +100 -0
  27. include/wayy_db/wal.hpp +78 -0
  28. include/wayy_db/wayy_db.hpp +16 -0
  29. pyproject.toml +127 -0
  30. python/bindings.cpp +377 -0
  31. python/wayy_db/__init__.py +122 -0
  32. python/wayy_db/_core.pyi +113 -0
  33. python/wayy_db/cli/__init__.py +1 -0
  34. python/wayy_db/cli/client.py +300 -0
  35. python/wayy_db/cli/config.py +42 -0
  36. python/wayy_db/cli/deploy.py +284 -0
  37. python/wayy_db/cli/main.py +522 -0
  38. python/wayy_db/cli/output.py +76 -0
  39. python/wayy_db/ops.py +55 -0
  40. src/column.cpp +121 -0
  41. src/database.cpp +156 -0
  42. src/hash_index.cpp +62 -0
  43. src/mmap_file.cpp +154 -0
  44. src/ops/aggregations.cpp +200 -0
  45. src/ops/joins.cpp +271 -0
  46. src/ops/window.cpp +314 -0
  47. src/string_column.cpp +224 -0
  48. src/table.cpp +778 -0
  49. src/types.cpp +25 -0
  50. src/wal.cpp +225 -0
.gitignore ADDED
@@ -0,0 +1,32 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *.so
+ .venv/
+ venv/
+ .env
+ *.egg-info/
+ dist/
+ build/
+ .pytest_cache/
+ .mypy_cache/
+ .ruff_cache/
+ htmlcov/
+ .coverage
+
+ # Node
+ node_modules/
+ .next/
+ dist/
+ .env.local
+
+ # Jupyter
+ .ipynb_checkpoints/
+
+ # OS
+ .DS_Store
+ *.swp
+
+ # Playwright
+ .playwright-mcp/
+
CMakeLists.txt ADDED
@@ -0,0 +1,123 @@
+ cmake_minimum_required(VERSION 3.20)
+ project(wayy_db VERSION 0.1.0 LANGUAGES CXX)
+
+ set(CMAKE_CXX_STANDARD 20)
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
+ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+ # Options
+ option(WAYY_BUILD_PYTHON "Build Python bindings" ON)
+ option(WAYY_BUILD_TESTS "Build unit tests" ON)
+ option(WAYY_BUILD_BENCHMARKS "Build benchmarks" OFF)
+ option(WAYY_USE_AVX2 "Enable AVX2 SIMD optimizations" ON)
+ option(WAYY_USE_LZ4 "Enable LZ4 compression" OFF)
+
+ # Compiler flags
+ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+     add_compile_options(-Wall -Wextra -Wpedantic)
+     if(WAYY_USE_AVX2)
+         add_compile_options(-mavx2 -mfma)
+     endif()
+ endif()
+
+ # Core library
+ add_library(wayy_core STATIC
+     src/types.cpp
+     src/column.cpp
+     src/string_column.cpp
+     src/hash_index.cpp
+     src/table.cpp
+     src/database.cpp
+     src/mmap_file.cpp
+     src/wal.cpp
+     src/ops/aggregations.cpp
+     src/ops/joins.cpp
+     src/ops/window.cpp
+ )
+
+ target_include_directories(wayy_core PUBLIC
+     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+     $<INSTALL_INTERFACE:include>
+ )
+
+ # Need PIC for linking into shared libraries (Python module)
+ set_target_properties(wayy_core PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+ if(WAYY_USE_AVX2)
+     target_compile_definitions(wayy_core PUBLIC WAYY_USE_AVX2=1)
+ endif()
+
+ if(WAYY_USE_LZ4)
+     find_package(lz4 REQUIRED)
+     target_link_libraries(wayy_core PRIVATE lz4::lz4)
+     target_compile_definitions(wayy_core PUBLIC WAYY_USE_LZ4=1)
+ endif()
+
+ # Python bindings
+ if(WAYY_BUILD_PYTHON)
+     find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module)
+
+     # Fetch pybind11 (v2.13+ required for free-threaded Python support)
+     include(FetchContent)
+     FetchContent_Declare(
+         pybind11
+         GIT_REPOSITORY https://github.com/pybind/pybind11.git
+         GIT_TAG v2.13.6
+     )
+     FetchContent_MakeAvailable(pybind11)
+
+     pybind11_add_module(_core python/bindings.cpp)
+     target_link_libraries(_core PRIVATE wayy_core)
+
+     # Install Python module to the package directory
+     # scikit-build-core will place this in the wayy_db package
+     install(TARGETS _core DESTINATION wayy_db COMPONENT python)
+ endif()
+
+ # Tests
+ if(WAYY_BUILD_TESTS)
+     enable_testing()
+
+     # Fetch GoogleTest
+     include(FetchContent)
+     FetchContent_Declare(
+         googletest
+         GIT_REPOSITORY https://github.com/google/googletest.git
+         GIT_TAG v1.14.0
+     )
+     # Prevent overriding parent project's compiler/linker settings (Windows)
+     set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+     FetchContent_MakeAvailable(googletest)
+
+     add_executable(wayy_tests
+         tests/test_types.cpp
+         tests/test_column.cpp
+         tests/test_table.cpp
+         tests/test_mmap.cpp
+         tests/test_joins.cpp
+     )
+
+     target_link_libraries(wayy_tests PRIVATE
+         wayy_core
+         GTest::gtest
+         GTest::gtest_main
+     )
+
+     include(GoogleTest)
+     gtest_discover_tests(wayy_tests)
+ endif()
+
+ # Benchmarks
+ if(WAYY_BUILD_BENCHMARKS)
+     find_package(benchmark REQUIRED)
+
+     add_executable(wayy_benchmarks
+         benchmarks/bench_aggregations.cpp
+         benchmarks/bench_joins.cpp
+     )
+
+     target_link_libraries(wayy_benchmarks PRIVATE
+         wayy_core
+         benchmark::benchmark
+     )
+ endif()
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ # WayyDB API Docker Image
+ FROM python:3.12
+
+ # Install C++ toolchain and cmake via apt (more reliable than pip cmake)
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     g++ cmake ninja-build \
+     && rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -m -u 1000 user
+ RUN mkdir -p /home/user/data/wayydb /data/wayydb && \
+     chown -R user:user /home/user /data
+
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     WAYY_DATA_PATH=/data/wayydb \
+     PORT=8080
+
+ WORKDIR $HOME/app
+
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir scikit-build-core pybind11 numpy build
+
+ COPY --chown=user . .
+
+ RUN pip install --no-cache-dir -v ".[api,cli]"
+
+ EXPOSE 8080
+
+ CMD uvicorn api.main:app --host 0.0.0.0 --port ${PORT:-8080}
README.md ADDED
@@ -0,0 +1,357 @@
+ ---
+ title: WayyDB API
+ emoji: ⚡
+ colorFrom: blue
+ colorTo: purple
+ sdk: docker
+ app_port: 7860
+ ---
+
+ <p align="center">
+   <h1 align="center">WayyDB</h1>
+   <p align="center">
+     <strong>High-performance columnar time-series database for quantitative finance</strong>
+   </p>
+   <p align="center">
+     kdb+ functionality &bull; Pythonic API &bull; Zero-copy NumPy &bull; SIMD-accelerated
+   </p>
+   <p align="center">
+     <a href="https://pypi.org/project/wayy-db/"><img src="https://img.shields.io/pypi/v/wayy-db" alt="PyPI"></a>
+     <a href="https://github.com/Wayy-Research/wayyDB/actions"><img src="https://github.com/Wayy-Research/wayyDB/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
+     <a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License: MIT"></a>
+     <a href="https://pypi.org/project/wayy-db/"><img src="https://img.shields.io/pypi/pyversions/wayy-db" alt="Python versions"></a>
+   </p>
+ </p>
+
+ ---
+
+ WayyDB is a C++ time-series database with Python bindings, designed for quantitative research and trading systems. It provides **kdb+-like temporal join operations** with a modern, accessible API—no q language required.
+
+ ## Why WayyDB?
+
+ | Challenge | WayyDB Solution |
+ |-----------|-----------------|
+ | kdb+ costs $100K+/year | Open source, free forever |
+ | q language learning curve | Pythonic API you already know |
+ | Pandas/Polars lack temporal joins | Native `aj()` and `wj()` primitives |
+ | Memory copies kill performance | Zero-copy NumPy via mmap |
+ | Slow aggregations | AVX2/AVX-512 SIMD acceleration |
+
+ ## Features
+
+ - **As-of Join (aj)** — For each trade, find the most recent quote. O(n log m) via binary search on sorted indices
+ - **Window Join (wj)** — Get all quotes within a time window around each trade
+ - **Zero-copy NumPy** — Columns are memory-mapped; `to_numpy()` returns views, not copies
+ - **SIMD Aggregations** — Sum, avg, min, max accelerated with AVX2 intrinsics
+ - **Window Functions** — Moving average, EMA, rolling std with O(n) complexity
+ - **Persistent Storage** — Tables saved as memory-mapped files for instant loading
+ - **Streaming API** — FastAPI REST + WebSocket endpoints for real-time tick ingestion and subscription
+ - **Pluggable Pub/Sub** — InMemory (default) or Redis backend for distributed deployments
+
+ ## Installation
+
+ ```bash
+ pip install wayy-db
+ ```
+
+ Or build from source:
+
+ ```bash
+ git clone https://github.com/wayy-research/wayydb.git
+ cd wayydb
+ pip install -e .
+ ```
+
+ ## Quick Start
+
+ ### Create Tables from NumPy Arrays
+
+ ```python
+ import wayy_db as wdb
+ import numpy as np
+
+ # Create trades table
+ trades = wdb.from_dict({
+     "timestamp": np.array([1000, 2000, 3000, 4000, 5000], dtype=np.int64),
+     "symbol": np.array([0, 1, 0, 1, 0], dtype=np.uint32),  # AAPL=0, MSFT=1
+     "price": np.array([150.25, 380.50, 151.00, 381.25, 152.00]),
+     "size": np.array([100, 200, 150, 250, 100], dtype=np.int64),
+ }, name="trades", sorted_by="timestamp")
+
+ # Create quotes table
+ quotes = wdb.from_dict({
+     "timestamp": np.array([500, 900, 1500, 2500, 3500], dtype=np.int64),
+     "symbol": np.array([0, 1, 0, 1, 0], dtype=np.uint32),
+     "bid": np.array([149.50, 379.50, 150.50, 380.50, 151.50]),
+     "ask": np.array([150.00, 380.00, 151.00, 381.00, 152.00]),
+ }, name="quotes", sorted_by="timestamp")
+ ```
+
+ ### As-of Join: Match Trades to Quotes
+
+ ```python
+ # For each trade, get the most recent quote for that symbol
+ result = wdb.ops.aj(trades, quotes, on=["symbol"], as_of="timestamp")
+
+ # Result contains trade columns + quote columns (bid, ask)
+ print(result["bid"].to_numpy())  # [149.5, 379.5, 150.5, 380.5, 151.5]
+ ```
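A window join can be sketched the same way. This is an illustrative example, assuming positional arguments matching the `wj(left, right, on, as_of, before, after)` signature listed in the API reference below and timestamps in the same units as the tables above:

```python
# All quotes within the 1000 time units before each trade, per symbol
# (before=1000, after=0; see the wj signature in the API reference)
windowed = wdb.ops.wj(trades, quotes, ["symbol"], "timestamp", 1000, 0)

print(windowed.num_rows, windowed.column_names())
```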
+
+ ### Aggregations and Window Functions
+
+ ```python
+ # SIMD-accelerated aggregations
+ total_volume = wdb.ops.sum(trades["size"])
+ avg_price = wdb.ops.avg(trades["price"])
+ price_std = wdb.ops.std(trades["price"])
+
+ # Window functions
+ mavg_20 = wdb.ops.mavg(trades["price"], window=20)
+ ema = wdb.ops.ema(trades["price"], alpha=0.1)
+ rolling_std = wdb.ops.mstd(trades["price"], window=10)
+
+ # Returns and changes
+ returns = wdb.ops.pct_change(trades["price"])
+ price_diff = wdb.ops.diff(trades["price"])
+ ```
+
+ ### Persistent Database
+
+ ```python
+ # Create persistent database
+ db = wdb.Database("/data/markets")
+
+ # Add table (automatically saved)
+ db.add_table(trades)
+
+ # Later: reload with zero-copy mmap
+ db2 = wdb.Database("/data/markets")
+ trades = db2["trades"]  # Instant load via memory mapping
+
+ # Access data without copying
+ prices = trades["price"].to_numpy()  # Zero-copy view into mmap'd file
+ ```
+
+ ### Pandas/Polars Interop
+
+ ```python
+ import pandas as pd
+ import polars as pl
+
+ # From pandas
+ df = pd.DataFrame({"timestamp": [...], "price": [...]})
+ table = wdb.from_pandas(df, name="from_pandas", sorted_by="timestamp")
+
+ # From polars
+ df = pl.DataFrame({"timestamp": [...], "price": [...]})
+ table = wdb.from_polars(df, name="from_polars", sorted_by="timestamp")
+
+ # To dict (for conversion back)
+ data = table.to_dict()  # {"timestamp": np.array, "price": np.array, ...}
+ ```
+
+ ## API Reference
+
+ ### Core Classes
+
+ | Class | Description |
+ |-------|-------------|
+ | `Database(path="")` | Container for tables. Empty path = in-memory |
+ | `Table(name="")` | Columnar table with optional sorted index |
+ | `Column` | Typed column with zero-copy NumPy access |
+
+ ### Table Methods
+
+ ```python
+ table.num_rows        # Number of rows
+ table.num_columns     # Number of columns
+ table.column_names()  # List of column names
+ table.sorted_by       # Column used for temporal ordering (or None)
+ table["col"]          # Get column by name
+ table.to_dict()       # Export as {name: np.array} dict
+ table.save(path)      # Save to directory
+ Table.load(path)      # Load from directory (copies data)
+ Table.mmap(path)      # Memory-map from directory (zero-copy)
+ ```
+
+ ### Operations (wayy_db.ops)
+
+ #### Aggregations
+ | Function | Description |
+ |----------|-------------|
+ | `sum(col)` | Sum of values (SIMD) |
+ | `avg(col)` | Mean of values |
+ | `min(col)` | Minimum value |
+ | `max(col)` | Maximum value |
+ | `std(col)` | Standard deviation |
+
+ #### Temporal Joins
+ | Function | Description |
+ |----------|-------------|
+ | `aj(left, right, on, as_of)` | As-of join: most recent right row for each left row |
+ | `wj(left, right, on, as_of, before, after)` | Window join: all right rows within time window |
+
+ #### Window Functions
+ | Function | Description |
+ |----------|-------------|
+ | `mavg(col, window)` | Moving average |
+ | `msum(col, window)` | Moving sum |
+ | `mstd(col, window)` | Moving standard deviation |
+ | `mmin(col, window)` | Moving minimum (O(n) via monotonic deque) |
+ | `mmax(col, window)` | Moving maximum (O(n) via monotonic deque) |
+ | `ema(col, alpha)` | Exponential moving average |
+ | `diff(col, periods=1)` | Difference from n periods ago |
+ | `pct_change(col, periods=1)` | Percent change from n periods ago |
+ | `shift(col, n)` | Shift values by n positions |
+
+ ## Type System
+
+ | Type | Python | C++ | Size | Use Case |
+ |------|--------|-----|------|----------|
+ | Int64 | `np.int64` | `int64_t` | 8B | Quantities, IDs |
+ | Float64 | `np.float64` | `double` | 8B | Prices, returns |
+ | Timestamp | `np.int64` | `int64_t` | 8B | Nanoseconds since epoch |
+ | Symbol | `np.uint32` | `uint32_t` | 4B | Interned strings (tickers) |
+ | Bool | `np.uint8` | `uint8_t` | 1B | Flags |
+
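Symbol columns hold interned tickers as dense `uint32` codes (the Quick Start uses AAPL=0, MSFT=1). A minimal sketch of producing such codes on the Python side; `encode_symbols` is a hypothetical helper, not part of the package:

```python
import numpy as np

def encode_symbols(tickers: list[str]) -> tuple[np.ndarray, dict[str, int]]:
    """Map ticker strings to dense uint32 codes for a Symbol column (hypothetical helper)."""
    codes: dict[str, int] = {}
    out = np.empty(len(tickers), dtype=np.uint32)
    for i, ticker in enumerate(tickers):
        out[i] = codes.setdefault(ticker, len(codes))
    return out, codes

symbols, mapping = encode_symbols(["AAPL", "MSFT", "AAPL"])
# symbols -> array([0, 1, 0], dtype=uint32); mapping -> {'AAPL': 0, 'MSFT': 1}
```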
+ ## Architecture
+
+ ```
+ ┌─────────────────────────────────────────────────────────────┐
+ │                      Python Interface                        │
+ │           wayy_db.Database | Table | Column | ops            │
+ ├─────────────────────────────────────────────────────────────┤
+ │                      pybind11 Bindings                       │
+ │          Zero-copy NumPy arrays via buffer protocol          │
+ ├─────────────────────────────────────────────────────────────┤
+ │                       C++ Core Engine                        │
+ │  ┌─────────────┐  ┌─────────────┐  ┌─────────────────────┐   │
+ │  │ Storage     │  │ Compute     │  │ Joins               │   │
+ │  │ • mmap I/O  │  │ • AVX2 agg  │  │ • O(n log m) aj     │   │
+ │  │ • columnar  │  │ • windows   │  │ • O(n) wj           │   │
+ │  └─────────────┘  └─────────────┘  └─────────────────────┘   │
+ ├─────────────────────────────────────────────────────────────┤
+ │                 Memory-Mapped File Storage                   │
+ │               Zero-copy | Lazy loading | Shared              │
+ └─────────────────────────────────────────────────────────────┘
+ ```
+
+ ## Performance
+
+ ### Complexity
+
+ | Operation | Complexity | Notes |
+ |-----------|------------|-------|
+ | As-of join | O(n log(m/k)) | n=left rows, m=right rows, k=unique keys |
+ | Window join | O(n log m + matches) | Plus output size |
+ | Aggregations | O(n) | SIMD 4x speedup for sum |
+ | Window functions | O(n) | Single pass with O(1) update |
+ | Point lookup | O(log n) | Binary search on sorted index |
+ | Load from disk | O(1) | Memory mapping, no deserialization |
+
+ ### Benchmarks vs Alternatives
+
+ Run the benchmark suite yourself:
+ ```bash
+ pip install wayy-db[bench]
+ python -m benchmarks.benchmark --compare pandas,polars,duckdb
+ ```
+
+ | Operation | wayyDB | pandas | Polars | DuckDB |
+ |-----------|--------|--------|--------|--------|
+ | As-of Join (1M x 1M) | 142ms | 8,234ms (58x slower) | 568ms (4x) | 345ms (2.4x) |
+ | Aggregation (5 ops) | 0.8ms | 16.2ms (20x) | 4.1ms (5x) | 5.6ms (7x) |
+ | Create Table (1M) | 12ms | 145ms (12x) | 35ms (3x) | 89ms (7x) |
+ | Load from Disk (1M) | 0.05ms (mmap) | 62ms (1240x) | 18ms (360x) | 32ms (640x) |
+
+ ### Design Targets
+
+ | Metric | Target |
+ |--------|--------|
+ | As-of join (1M x 1M rows) | < 150ms |
+ | Simple aggregation (1B rows) | < 80ms |
+ | Binary size | < 5 MB |
+ | Memory overhead | < 1% beyond data |
+
+ ## Building from Source
+
+ ### Requirements
+
+ - CMake >= 3.20
+ - C++20 compiler (GCC 11+, Clang 14+, MSVC 2022+)
+ - Python >= 3.9
+
+ ### Build
+
+ ```bash
+ git clone https://github.com/wayy-research/wayydb.git
+ cd wayydb
+
+ # Option 1: pip install (recommended)
+ pip install -e .
+
+ # Option 2: CMake directly
+ mkdir build && cd build
+ cmake .. -DWAYY_BUILD_PYTHON=ON -DWAYY_BUILD_TESTS=ON
+ make -j$(nproc)
+ ```
+
+ ### Run Tests
+
+ ```bash
+ # C++ tests (31 tests)
+ cd build && ctest --output-on-failure
+
+ # Python tests (81 tests)
+ pytest tests/python -v
+ ```
+
+ ## Comparison with Alternatives
+
+ | Feature | WayyDB | kdb+ | DuckDB | Polars |
+ |---------|--------|------|--------|--------|
+ | As-of join | Native | Native | Extension | None |
+ | Window join | Native | Native | None | None |
+ | Zero-copy Python | Yes | No | No | Limited |
+ | Sorted index optimization | Yes | Yes | No | No |
+ | License | MIT | Commercial | MIT | MIT |
+ | Learning curve | Low | High (q) | Low | Low |
+ | Persistence | mmap | Native | Native | None |
+
+ ## Roadmap
+
+ - [x] Streaming ingestion API (WebSocket + REST)
+ - [x] Pluggable pub/sub (InMemory + Redis)
+ - [x] Multi-deployment Docker (Fly.io, Render, HF Spaces)
+ - [ ] String column type with dictionary encoding
+ - [ ] LZ4 compression for columns
+ - [ ] Parallel aggregations
+ - [ ] More join types (inner, left, full)
+ - [ ] Query optimizer
+
+ ## License
+
+ MIT License - see [LICENSE](LICENSE) for details.
+
+ ## Contributing
+
+ Contributions welcome! Please read our contributing guidelines and submit PRs to the `develop` branch.
+
+ ## Citation
+
+ If you use wayyDB in your research, please cite:
+
+ ```bibtex
+ @software{wayydb2026,
+   title = {wayyDB: A High-Performance Columnar Time-Series Database},
+   author = {Galbo, Rick},
+   year = {2026},
+   url = {https://github.com/Wayy-Research/wayyDB}
+ }
+ ```
+
+ ---
+
+ <p align="center">
+   Built with C++20 and Python by <a href="https://wayy.io">Wayy Research</a>
+ </p>
api/__pycache__/main.cpython-310.pyc ADDED
Binary file (26 kB).
api/__pycache__/pubsub.cpython-310.pyc ADDED
Binary file (16.4 kB).
api/__pycache__/streaming.cpython-310.pyc ADDED
Binary file (15.6 kB).
api/kvstore.py ADDED
@@ -0,0 +1,150 @@
+ """
+ KV Store - In-memory key-value store with TTL for wayyDB.
+
+ Provides Redis-like KV semantics for future multi-process scaling.
+ Background eviction runs every 60 seconds.
+ """
+
+ import asyncio
+ import logging
+ import time
+ from fnmatch import fnmatch
+ from typing import Any, Dict, List, Optional
+
+ logger = logging.getLogger(__name__)
+
+
+ class KVEntry:
+     """A stored value with optional TTL."""
+     __slots__ = ("value", "expires_at", "created_at")
+
+     def __init__(self, value: Any, ttl: Optional[float] = None):
+         now = time.time()
+         self.value = value
+         self.expires_at = now + ttl if ttl else float("inf")
+         self.created_at = now
+
+     @property
+     def is_expired(self) -> bool:
+         return time.time() > self.expires_at
+
+     @property
+     def ttl_remaining(self) -> Optional[float]:
+         if self.expires_at == float("inf"):
+             return None
+         remaining = self.expires_at - time.time()
+         return max(0, remaining)
+
+
+ class KVStore:
+     """
+     In-memory KV store with TTL and background eviction.
+
+     Thread-safe for single-process async use (GIL + event loop).
+     """
+
+     def __init__(self) -> None:
+         self._data: Dict[str, KVEntry] = {}
+         self._eviction_task: Optional[asyncio.Task] = None
+         self._sets: int = 0
+         self._gets: int = 0
+         self._deletes: int = 0
+         self._evictions: int = 0
+
+     async def start(self) -> None:
+         """Start the background eviction task."""
+         if self._eviction_task is None:
+             self._eviction_task = asyncio.create_task(self._eviction_loop())
+             logger.info("KVStore eviction task started")
+
+     async def stop(self) -> None:
+         """Stop the background eviction task."""
+         if self._eviction_task:
+             self._eviction_task.cancel()
+             try:
+                 await self._eviction_task
+             except asyncio.CancelledError:
+                 pass
+             self._eviction_task = None
+
+     def set(self, key: str, value: Any, ttl: Optional[float] = None) -> None:
+         """Set a key with optional TTL (seconds)."""
+         self._data[key] = KVEntry(value, ttl)
+         self._sets += 1
+
+     def get(self, key: str) -> Optional[Any]:
+         """Get a value by key. Returns None if missing or expired."""
+         self._gets += 1
+         entry = self._data.get(key)
+         if entry is None:
+             return None
+         if entry.is_expired:
+             del self._data[key]
+             self._evictions += 1
+             return None
+         return entry.value
+
+     def delete(self, key: str) -> bool:
+         """Delete a key. Returns True if existed."""
+         existed = key in self._data
+         if existed:
+             del self._data[key]
+             self._deletes += 1
+         return existed
+
+     def keys(self, pattern: Optional[str] = None) -> List[str]:
+         """List keys, optionally filtered by glob pattern."""
+         now = time.time()
+         result = []
+         for k, v in self._data.items():
+             if v.expires_at > now:
+                 if pattern is None or fnmatch(k, pattern):
+                     result.append(k)
+         return result
+
+     def stats(self) -> Dict[str, Any]:
+         """Get store statistics."""
+         now = time.time()
+         active = sum(1 for v in self._data.values() if v.expires_at > now)
+         return {
+             "total_keys": len(self._data),
+             "active_keys": active,
+             "sets": self._sets,
+             "gets": self._gets,
+             "deletes": self._deletes,
+             "evictions": self._evictions,
+         }
+
+     async def _eviction_loop(self) -> None:
+         """Background loop to evict expired entries every 60s."""
+         while True:
+             try:
+                 await asyncio.sleep(60)
+                 count = self._evict_expired()
+                 if count > 0:
+                     logger.debug(f"KVStore evicted {count} expired entries")
+             except asyncio.CancelledError:
+                 break
+             except Exception as e:
+                 logger.error(f"KVStore eviction error: {e}")
+
+     def _evict_expired(self) -> int:
+         """Evict all expired entries. Returns count evicted."""
+         now = time.time()
+         expired = [k for k, v in self._data.items() if now > v.expires_at]
+         for k in expired:
+             del self._data[k]
+         self._evictions += len(expired)
+         return len(expired)
+
+
+ # Global singleton
+ _kv_store: Optional[KVStore] = None
+
+
+ def get_kv_store() -> KVStore:
+     """Get the global KV store instance."""
+     global _kv_store
+     if _kv_store is None:
+         _kv_store = KVStore()
+     return _kv_store
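A minimal usage sketch of the store defined above (the method names come from this file; the asyncio wiring is only illustrative):

```python
import asyncio
from api.kvstore import get_kv_store

async def demo() -> None:
    kv = get_kv_store()
    await kv.start()                             # launches the 60-second eviction loop

    kv.set("session:abc", {"user": 42}, ttl=30)  # expires after 30 seconds
    kv.set("config:mode", "paper")               # no TTL, never expires

    print(kv.get("session:abc"))                 # {'user': 42}
    print(kv.keys("session:*"))                  # ['session:abc']
    print(kv.stats())                            # counters: sets, gets, deletes, evictions

    await kv.stop()

asyncio.run(demo())
```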
api/main.py ADDED
@@ -0,0 +1,1031 @@
1
+ """
2
+ WayyDB REST API - High-performance columnar time-series database service
3
+
4
+ Features:
5
+ - REST API for table operations, aggregations, joins, window functions
6
+ - WebSocket streaming ingestion for real-time tick data
7
+ - WebSocket pub/sub for streaming updates to clients
8
+ - Efficient batching and append operations
9
+ """
10
+ import os
11
+ import re
12
+ import asyncio
13
+ import logging
14
+ from concurrent.futures import ThreadPoolExecutor
15
+ from contextlib import asynccontextmanager
16
+ from typing import Any, Optional, List
17
+
18
+ import numpy as np
19
+ from fastapi import FastAPI, HTTPException, Query, Request, WebSocket, WebSocketDisconnect
20
+ from fastapi.middleware.cors import CORSMiddleware
21
+ from pydantic import BaseModel, ValidationError
22
+
23
+ # Import wayyDB
24
+ import wayy_db as wdb
25
+
26
+ # Import streaming module
27
+ from api.streaming import (
28
+ get_streaming_manager,
29
+ start_streaming,
30
+ stop_streaming,
31
+ StreamingManager,
32
+ )
33
+
34
+ # Import KV store
35
+ from api.kvstore import get_kv_store
36
+
37
+ # Configure logging
38
+ logging.basicConfig(level=logging.INFO)
39
+ logger = logging.getLogger(__name__)
40
+
41
+ # Thread pool for running CPU-bound wayyDB operations
42
+ executor = ThreadPoolExecutor(max_workers=4)
43
+
44
+ # Global database instance
45
+ db: Optional[wdb.Database] = None
46
+
47
+
48
+ @asynccontextmanager
49
+ async def lifespan(app: FastAPI):
50
+ """Initialize database and streaming on startup."""
51
+ global db
52
+ data_path = os.environ.get("WAYY_DATA_PATH", "/data/wayydb")
53
+ os.makedirs(data_path, exist_ok=True)
54
+ db = wdb.Database(data_path)
55
+
56
+ # Initialize streaming manager with database reference
57
+ streaming = get_streaming_manager()
58
+ streaming.set_database(db)
59
+ await start_streaming()
60
+
61
+ # Start KV store eviction
62
+ kv = get_kv_store()
63
+ await kv.start()
64
+
65
+ logger.info(f"WayyDB started with data path: {data_path}")
66
+
67
+ yield
68
+
69
+ # Cleanup
70
+ await kv.stop()
71
+ await stop_streaming()
72
+ if db:
73
+ db.save()
74
+ logger.info("WayyDB shutdown complete")
75
+
76
+
77
+ app = FastAPI(
78
+ title="WayyDB API",
79
+ description="High-performance columnar time-series database with kdb+-like functionality",
80
+ version="0.1.0",
81
+ lifespan=lifespan,
82
+ )
83
+
84
+ # CORS - configurable via CORS_ORIGINS env var
85
+ ALLOWED_ORIGINS = os.getenv("CORS_ORIGINS", "http://localhost:3000").split(",")
86
+
87
+ app.add_middleware(
88
+ CORSMiddleware,
89
+ allow_origins=ALLOWED_ORIGINS,
90
+ allow_credentials=True,
91
+ allow_methods=["GET", "POST", "PUT", "DELETE"],
92
+ allow_headers=["Content-Type", "Authorization"],
93
+ )
94
+
95
+
96
+ # --- Pydantic Models ---
97
+
98
+ class TableCreate(BaseModel):
99
+ name: str
100
+ sorted_by: Optional[str] = None
101
+
102
+
103
+ class ColumnData(BaseModel):
104
+ name: str
105
+ dtype: str # "int64", "float64", "timestamp", "symbol", "bool"
106
+ data: list
107
+
108
+
109
+ class TableData(BaseModel):
110
+ name: str
111
+ columns: list[ColumnData]
112
+ sorted_by: Optional[str] = None
113
+
114
+
115
+ class AggregationResult(BaseModel):
116
+ column: str
117
+ operation: str
118
+ result: float
119
+
120
+
121
+ class JoinRequest(BaseModel):
122
+ left_table: str
123
+ right_table: str
124
+ on: list[str]
125
+ as_of: str
126
+ window_before: Optional[int] = None # For window join
127
+ window_after: Optional[int] = None
128
+
129
+
130
+ class WindowRequest(BaseModel):
131
+ table: str
132
+ column: str
133
+ operation: str # mavg, msum, mstd, mmin, mmax, ema
134
+ window: Optional[int] = None
135
+ alpha: Optional[float] = None # For EMA
136
+
137
+
138
+ class AppendData(BaseModel):
139
+ """Data to append to an existing table."""
140
+ columns: list[ColumnData]
141
+
142
+
143
+ class RowData(BaseModel):
144
+ """A single row as key-value pairs."""
145
+ data: dict[str, Any]
146
+
147
+
148
+ class TableCreateOLTP(BaseModel):
149
+ """Create a table with OLTP schema definition."""
150
+ name: str
151
+ columns: list[dict] # [{"name": "id", "dtype": "string"}, ...]
152
+ primary_key: Optional[str] = None
153
+ sorted_by: Optional[str] = None
154
+
155
+
156
+ class IngestTick(BaseModel):
157
+ """A single tick for streaming ingestion."""
158
+ symbol: str
159
+ price: float
160
+ timestamp: Optional[int] = None # Nanoseconds since epoch
161
+ volume: Optional[float] = 0.0
162
+ bid: Optional[float] = None
163
+ ask: Optional[float] = None
164
+
165
+
166
+ class IngestBatch(BaseModel):
167
+ """Batch of ticks for streaming ingestion."""
168
+ ticks: list[IngestTick]
169
+
170
+
171
+ class SubscribeRequest(BaseModel):
172
+ """Subscription filter for WebSocket."""
173
+ symbols: Optional[list[str]] = None # None = all symbols
174
+
175
+
176
+ # --- Helper Functions ---
177
+
178
+ def dtype_from_string(s: str) -> wdb.DType:
179
+ mapping = {
180
+ "int64": wdb.DType.Int64,
181
+ "float64": wdb.DType.Float64,
182
+ "timestamp": wdb.DType.Timestamp,
183
+ "symbol": wdb.DType.Symbol,
184
+ "bool": wdb.DType.Bool,
185
+ }
186
+ # These types exist in C++ headers but aren't yet exposed in pybind11 bindings
187
+ # "string": _DTYPE_STRING,
188
+ # "decimal6": wdb.DType.Decimal6,
189
+ if s.lower() not in mapping:
190
+ raise ValueError(f"Unknown dtype: {s}. Available: {list(mapping.keys())}")
191
+ return mapping[s.lower()]
192
+
193
+
194
+ # String DType not yet in pybind11 bindings — use sentinel for safe comparisons
195
+ _DTYPE_STRING = getattr(wdb.DType, "String", None)
196
+
197
+
198
+ TABLE_NAME_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]{0,63}$')
199
+
200
+
201
+ def validate_table_name(name: str) -> str:
202
+ if not TABLE_NAME_RE.match(name):
203
+ raise HTTPException(400, f"Invalid table name: {name}")
204
+ return name
205
+
206
+
207
+ def numpy_dtype_for(dtype: wdb.DType):
208
+ mapping = {
209
+ wdb.DType.Int64: np.int64,
210
+ wdb.DType.Float64: np.float64,
211
+ wdb.DType.Timestamp: np.int64,
212
+ wdb.DType.Symbol: np.uint32,
213
+ wdb.DType.Bool: np.uint8,
214
+ }
215
+ return mapping[dtype]
216
+
217
+
218
+ async def run_in_executor(func, *args):
219
+ """Run blocking wayyDB operations in thread pool."""
220
+ loop = asyncio.get_event_loop()
221
+ return await loop.run_in_executor(executor, func, *args)
222
+
223
+
224
+ # --- Routes ---
225
+
226
+ @app.get("/")
227
+ async def root():
228
+ return {
229
+ "service": "WayyDB API",
230
+ "version": "0.1.0",
231
+ "status": "healthy",
232
+ }
233
+
234
+
235
+ @app.get("/health")
236
+ async def health():
237
+ return {"status": "healthy", "tables": len(db.tables()) if db else 0}
238
+
239
+
240
+ # --- Table Operations ---
241
+
242
+ @app.get("/tables")
243
+ async def list_tables():
244
+ """List all tables in the database."""
245
+ return {"tables": db.tables()}
246
+
247
+
248
+ @app.post("/tables")
249
+ async def create_table(table: TableCreate):
250
+ """Create a new empty table."""
251
+ if db.has_table(table.name):
252
+ raise HTTPException(400, f"Table '{table.name}' already exists")
253
+
254
+ t = db.create_table(table.name)
255
+ if table.sorted_by:
256
+ t.set_sorted_by(table.sorted_by)
257
+ db.save()
258
+ return {"created": table.name}
259
+
260
+
261
+ @app.post("/tables/upload")
262
+ async def upload_table(table_data: TableData):
263
+ """Upload a complete table with data."""
264
+ if db.has_table(table_data.name):
265
+ raise HTTPException(400, f"Table '{table_data.name}' already exists")
266
+
267
+ t = wdb.Table(table_data.name)
268
+
269
+ for col in table_data.columns:
270
+ dtype = dtype_from_string(col.dtype)
271
+ np_dtype = numpy_dtype_for(dtype)
272
+ arr = np.array(col.data, dtype=np_dtype)
273
+ t.add_column_from_numpy(col.name, arr, dtype)
274
+
275
+ if table_data.sorted_by:
276
+ t.set_sorted_by(table_data.sorted_by)
277
+
278
+ db.add_table(t)
279
+ db.save()
280
+
281
+ return {
282
+ "created": table_data.name,
283
+ "rows": t.num_rows,
284
+ "columns": t.column_names(),
285
+ }
286
+
287
+
288
+ @app.get("/tables/{name}")
289
+ async def get_table_info(name: str):
290
+ """Get table metadata."""
291
+ if not db.has_table(name):
292
+ raise HTTPException(404, f"Table '{name}' not found")
293
+
294
+ t = db[name]
295
+ return {
296
+ "name": t.name,
297
+ "num_rows": t.num_rows,
298
+ "num_columns": t.num_columns,
299
+ "columns": t.column_names(),
300
+ "sorted_by": t.sorted_by,
301
+ }
302
+
303
+
304
+ @app.get("/tables/{name}/data")
305
+ async def get_table_data(
306
+ name: str,
307
+ limit: int = Query(default=100, le=10000),
308
+ offset: int = Query(default=0, ge=0),
309
+ ):
310
+ """Get table data as JSON."""
311
+ if not db.has_table(name):
312
+ raise HTTPException(404, f"Table '{name}' not found")
313
+
314
+ t = db[name]
315
+ end = min(offset + limit, t.num_rows)
316
+
317
+ result = {}
318
+ for col_name in t.column_names():
319
+ col = t[col_name]
320
+ arr = col.to_numpy()[offset:end]
321
+ result[col_name] = arr.tolist()
322
+
323
+ return {
324
+ "table": name,
325
+ "offset": offset,
326
+ "limit": limit,
327
+ "total_rows": t.num_rows,
328
+ "data": result,
329
+ }
330
+
331
+
332
+ @app.delete("/tables/{name}")
333
+ async def delete_table(name: str):
334
+ """Delete a table."""
335
+ if not db.has_table(name):
336
+ raise HTTPException(404, f"Table '{name}' not found")
337
+
338
+ db.drop_table(name)
339
+ return {"deleted": name}
340
+
341
+
342
+ # --- Aggregations ---
343
+
344
+ @app.get("/tables/{name}/agg/{column}/{operation}")
345
+ async def aggregate(name: str, column: str, operation: str):
346
+ """
347
+ Run aggregation on a column.
348
+ Operations: sum, avg, min, max, std
349
+ """
350
+ if not db.has_table(name):
351
+ raise HTTPException(404, f"Table '{name}' not found")
352
+
353
+ t = db[name]
354
+ if not t.has_column(column):
355
+ raise HTTPException(404, f"Column '{column}' not found")
356
+
357
+ col = t[column]
358
+
359
+ ops_map = {
360
+ "sum": wdb.ops.sum,
361
+ "avg": wdb.ops.avg,
362
+ "min": wdb.ops.min,
363
+ "max": wdb.ops.max,
364
+ "std": wdb.ops.std,
365
+ }
366
+
367
+ if operation not in ops_map:
368
+ raise HTTPException(400, f"Unknown operation: {operation}")
369
+
370
+ # Run in thread pool for concurrency
371
+ result = await run_in_executor(ops_map[operation], col)
372
+
373
+ return AggregationResult(column=column, operation=operation, result=result)
374
+
375
+
376
+ # --- Joins ---
377
+
378
+ @app.post("/join/aj")
379
+ async def as_of_join(req: JoinRequest):
380
+ """
381
+ As-of join: find most recent right row for each left row.
382
+ Both tables must be sorted by the as_of column.
383
+ """
384
+ if not db.has_table(req.left_table):
385
+ raise HTTPException(404, f"Table '{req.left_table}' not found")
386
+ if not db.has_table(req.right_table):
387
+ raise HTTPException(404, f"Table '{req.right_table}' not found")
388
+
389
+ left = db[req.left_table]
390
+ right = db[req.right_table]
391
+
392
+ def do_join():
393
+ return wdb.ops.aj(left, right, req.on, req.as_of)
394
+
395
+ result = await run_in_executor(do_join)
396
+
397
+ # Return as dict
398
+ data = {}
399
+ for col_name in result.column_names():
400
+ data[col_name] = result[col_name].to_numpy().tolist()
401
+
402
+ return {
403
+ "join_type": "as_of",
404
+ "rows": result.num_rows,
405
+ "columns": result.column_names(),
406
+ "data": data,
407
+ }
408
+
409
+
410
+ @app.post("/join/wj")
411
+ async def window_join(req: JoinRequest):
412
+ """
413
+ Window join: find all right rows within time window.
414
+ """
415
+ if not db.has_table(req.left_table):
416
+ raise HTTPException(404, f"Table '{req.left_table}' not found")
417
+ if not db.has_table(req.right_table):
418
+ raise HTTPException(404, f"Table '{req.right_table}' not found")
419
+
420
+ if req.window_before is None or req.window_after is None:
421
+ raise HTTPException(400, "window_before and window_after required for window join")
422
+
423
+ left = db[req.left_table]
424
+ right = db[req.right_table]
425
+
426
+ def do_join():
427
+ return wdb.ops.wj(left, right, req.on, req.as_of,
428
+ req.window_before, req.window_after)
429
+
430
+ result = await run_in_executor(do_join)
431
+
432
+ data = {}
433
+ for col_name in result.column_names():
434
+ data[col_name] = result[col_name].to_numpy().tolist()
435
+
436
+ return {
437
+ "join_type": "window",
438
+ "rows": result.num_rows,
439
+ "columns": result.column_names(),
440
+ "data": data,
441
+ }
442
+
443
+
444
+ # --- Window Functions ---
445
+
446
+ @app.post("/window")
447
+ async def window_function(req: WindowRequest):
448
+ """
449
+ Apply window function to a column.
450
+ Operations: mavg, msum, mstd, mmin, mmax, ema, diff, pct_change
451
+ """
452
+ if not db.has_table(req.table):
453
+ raise HTTPException(404, f"Table '{req.table}' not found")
454
+
455
+ t = db[req.table]
456
+ if not t.has_column(req.column):
457
+ raise HTTPException(404, f"Column '{req.column}' not found")
458
+
459
+ col = t[req.column]
460
+
461
+ def do_window():
462
+ if req.operation == "mavg":
463
+ return wdb.ops.mavg(col, req.window)
464
+ elif req.operation == "msum":
465
+ return wdb.ops.msum(col, req.window)
466
+ elif req.operation == "mstd":
467
+ return wdb.ops.mstd(col, req.window)
468
+ elif req.operation == "mmin":
469
+ return wdb.ops.mmin(col, req.window)
470
+ elif req.operation == "mmax":
471
+ return wdb.ops.mmax(col, req.window)
472
+ elif req.operation == "ema":
473
+ return wdb.ops.ema(col, req.alpha)
474
+ elif req.operation == "diff":
475
+ return wdb.ops.diff(col, req.window or 1)
476
+ elif req.operation == "pct_change":
477
+ return wdb.ops.pct_change(col, req.window or 1)
478
+ else:
479
+ raise ValueError(f"Unknown operation: {req.operation}")
480
+
481
+ result = await run_in_executor(do_window)
482
+
483
+ return {
484
+ "table": req.table,
485
+ "column": req.column,
486
+ "operation": req.operation,
487
+ "result": result.tolist(),
488
+ }
489
+
490
+
491
+ # --- Append API ---
492
+
493
+ @app.post("/tables/{name}/append")
494
+ async def append_to_table(name: str, data: AppendData):
495
+ """
496
+ Append rows to an existing table.
497
+
498
+ This is more efficient than re-uploading the entire table.
499
+ The new data must have the same columns as the existing table.
500
+ """
501
+ if not db.has_table(name):
502
+ raise HTTPException(404, f"Table '{name}' not found")
503
+
504
+ existing = db[name]
505
+ existing_cols = set(existing.column_names())
506
+
507
+ # Validate columns match
508
+ new_cols = {col.name for col in data.columns}
509
+ if existing_cols != new_cols:
510
+ raise HTTPException(
511
+ 400,
512
+ f"Column mismatch. Expected: {sorted(existing_cols)}, got: {sorted(new_cols)}"
513
+ )
514
+
515
+ # Get existing data
516
+ existing_data = {}
517
+ for col_name in existing.column_names():
518
+ existing_data[col_name] = existing[col_name].to_numpy()
519
+
520
+ # Prepare new data
521
+ new_data = {}
522
+ for col in data.columns:
523
+ dtype = dtype_from_string(col.dtype)
524
+ np_dtype = numpy_dtype_for(dtype)
525
+ new_data[col.name] = np.array(col.data, dtype=np_dtype)
526
+
527
+ # Concatenate
528
+ combined = {}
529
+ for col_name in existing_cols:
530
+ combined[col_name] = np.concatenate([existing_data[col_name], new_data[col_name]])
531
+
532
+ # Get sorted_by before dropping
533
+ sorted_by = existing.sorted_by
534
+
535
+ # Drop and recreate
536
+ db.drop_table(name)
537
+ new_table = wdb.from_dict(combined, name=name, sorted_by=sorted_by)
538
+ db.add_table(new_table)
539
+ db.save()
540
+
541
+ return {
542
+ "appended": name,
543
+ "new_rows": len(data.columns[0].data) if data.columns else 0,
544
+ "total_rows": new_table.num_rows,
545
+ }
546
+
547
+
548
+ # --- OLTP / CRUD API ---
549
+
550
+ @app.post("/api/v1/{db_name}/tables")
551
+ async def create_oltp_table(db_name: str, schema: TableCreateOLTP):
552
+ """Create a table with typed columns and optional primary key."""
553
+ validate_table_name(schema.name)
554
+
555
+ if db.has_table(schema.name):
556
+ raise HTTPException(400, f"Table '{schema.name}' already exists")
557
+
558
+ t = db.create_table(schema.name)
559
+
560
+ # Add columns based on schema
561
+ for col_def in schema.columns:
562
+ col_name = col_def["name"]
563
+ dtype_str = col_def["dtype"]
564
+ dtype = dtype_from_string(dtype_str)
565
+ np_dtype = numpy_dtype_for(dtype)
566
+ arr = np.array([], dtype=np_dtype)
567
+ t.add_column_from_numpy(col_name, arr, dtype)
568
+
569
+ if schema.sorted_by:
570
+ t.set_sorted_by(schema.sorted_by)
571
+ if schema.primary_key:
572
+ t.set_primary_key(schema.primary_key)
573
+
574
+ db.save()
575
+ return {"created": schema.name, "columns": [c["name"] for c in schema.columns]}
576
+
577
+
578
+ @app.post("/api/v1/{db_name}/tables/{table_name}/rows")
579
+ async def insert_row(db_name: str, table_name: str, row: RowData):
580
+ """Insert a single row into a table."""
581
+ if not db.has_table(table_name):
582
+ raise HTTPException(404, f"Table '{table_name}' not found")
583
+
584
+ t = db[table_name]
585
+ try:
586
+ row_idx = t.append_row(row.data)
587
+ except Exception as e:
588
+ raise HTTPException(400, str(e))
589
+
590
+ return {"table": table_name, "row_index": row_idx}
591
+
592
+
593
+ @app.put("/api/v1/{db_name}/tables/{table_name}/rows/{pk}")
594
+ async def update_row(db_name: str, table_name: str, pk: str, row: RowData):
595
+ """Update a row by primary key."""
596
+ if not db.has_table(table_name):
597
+ raise HTTPException(404, f"Table '{table_name}' not found")
598
+
599
+ t = db[table_name]
600
+ if not t.primary_key:
601
+ raise HTTPException(400, "Table has no primary key set")
602
+
603
+ pk_dtype = t.column_dtype(t.primary_key)
604
+
605
+ try:
606
+ if pk_dtype == _DTYPE_STRING:
607
+ ok = t.update_row(pk, row.data)
608
+ else:
609
+ ok = t.update_row(int(pk), row.data)
610
+ except Exception as e:
611
+ raise HTTPException(400, str(e))
612
+
613
+ if not ok:
614
+ raise HTTPException(404, f"Row with pk={pk} not found")
615
+
616
+ return {"table": table_name, "pk": pk, "updated": True}
617
+
618
+
619
+ @app.delete("/api/v1/{db_name}/tables/{table_name}/rows/{pk}")
620
+ async def delete_row(db_name: str, table_name: str, pk: str):
621
+ """Soft-delete a row by primary key."""
622
+ if not db.has_table(table_name):
623
+ raise HTTPException(404, f"Table '{table_name}' not found")
624
+
625
+ t = db[table_name]
626
+ if not t.primary_key:
627
+ raise HTTPException(400, "Table has no primary key set")
628
+
629
+ pk_dtype = t.column_dtype(t.primary_key)
630
+
631
+ if pk_dtype == _DTYPE_STRING:
632
+ ok = t.delete_row(pk)
633
+ else:
634
+ ok = t.delete_row(int(pk))
635
+
636
+ if not ok:
637
+ raise HTTPException(404, f"Row with pk={pk} not found")
638
+
639
+ return {"table": table_name, "pk": pk, "deleted": True}
640
+
641
+
642
+ def _read_row_at(t, row_idx: int) -> dict[str, Any]:
643
+ """Read a single row from a table by index, returning a dict."""
644
+ row = {}
645
+ for col_name in t.column_names():
646
+ if t.has_string_column(col_name):
647
+ scol = t.string_column(col_name)
648
+ row[col_name] = scol.get(row_idx)
649
+ else:
650
+ col = t.column(col_name)
651
+ arr = col.to_numpy()
652
+ val = arr[row_idx]
653
+ # Convert numpy types to Python native for JSON serialization
654
+ row[col_name] = val.item() if hasattr(val, "item") else val
655
+ return row
656
+
657
+
658
+ @app.get("/api/v1/{db_name}/tables/{table_name}/rows/{pk}")
659
+ async def get_row_by_pk(db_name: str, table_name: str, pk: str):
660
+ """Get a single row by primary key."""
661
+ if not db.has_table(table_name):
662
+ raise HTTPException(404, f"Table '{table_name}' not found")
663
+
664
+ t = db[table_name]
665
+ if not t.primary_key:
666
+ raise HTTPException(400, "Table has no primary key set")
667
+
668
+ pk_dtype = t.column_dtype(t.primary_key)
669
+
670
+ if pk_dtype == _DTYPE_STRING:
671
+ row_idx = t.find_row(pk)
672
+ else:
673
+ row_idx = t.find_row(int(pk))
674
+
675
+ if row_idx is None:
676
+ raise HTTPException(404, f"Row with pk={pk} not found")
677
+
678
+ return {"data": _read_row_at(t, row_idx)}
679
+
680
+
681
+ @app.get("/api/v1/{db_name}/tables/{table_name}/rows")
682
+ async def filter_rows(db_name: str, table_name: str, request: Request):
683
+ """Filter rows by query parameters (col=val). Returns matching row data."""
684
+ if not db.has_table(table_name):
685
+ raise HTTPException(404, f"Table '{table_name}' not found")
686
+
687
+ t = db[table_name]
688
+ params = dict(request.query_params)
689
+ limit = int(params.pop("limit", "500"))
690
+
691
+ # Intersect filter results across all query params
692
+ row_indices = None
693
+ for col, val in params.items():
694
+ if not t.has_column(col) and not t.has_string_column(col):
695
+ continue
696
+ try:
697
+ col_dtype = t.column_dtype(col)
698
+ if col_dtype == _DTYPE_STRING:
699
+ matches = set(t.where_eq(col, val))
700
+ else:
701
+ matches = set(t.where_eq(col, int(val)))
702
+ except Exception:
703
+ continue
704
+ row_indices = matches if row_indices is None else row_indices & matches
705
+
706
+ # If no filters, return all valid rows
707
+ if row_indices is None:
708
+ row_indices = set(range(t.num_rows))
709
+
710
+ # Sort and limit
711
+ sorted_indices = sorted(row_indices)[:limit]
712
+
713
+ rows = [_read_row_at(t, idx) for idx in sorted_indices]
714
+ return {"data": rows, "count": len(rows)}
715
+
716
+
717
+ @app.post("/api/v1/{db_name}/checkpoint")
718
+ async def checkpoint(db_name: str):
719
+ """Flush WAL, save all tables to disk, truncate WAL."""
720
+ db.checkpoint()
721
+ return {"checkpoint": "ok"}
722
+
723
+
724
+ # --- Streaming Ingestion API ---
725
+
726
+ @app.post("/ingest/{table}")
727
+ async def ingest_tick(table: str, tick: IngestTick):
728
+ """
729
+ Ingest a single tick via REST.
730
+
731
+ For high-throughput, use the WebSocket endpoint instead.
732
+ """
733
+ validate_table_name(table)
734
+ streaming = get_streaming_manager()
735
+ await streaming.ingest_tick(
736
+ table=table,
737
+ symbol=tick.symbol,
738
+ price=tick.price,
739
+ timestamp=tick.timestamp,
740
+ volume=tick.volume or 0.0,
741
+ bid=tick.bid or tick.price,
742
+ ask=tick.ask or tick.price,
743
+ )
744
+ return {"ingested": 1, "table": table}
745
+
746
+
747
+ @app.post("/ingest/{table}/batch")
748
+ async def ingest_batch(table: str, batch: IngestBatch):
749
+ """
750
+ Ingest a batch of ticks via REST.
751
+
752
+ For high-throughput, use the WebSocket endpoint instead.
753
+ """
754
+ validate_table_name(table)
755
+ streaming = get_streaming_manager()
756
+ ticks = [
757
+ {
758
+ "symbol": t.symbol,
759
+ "price": t.price,
760
+ "timestamp": t.timestamp,
761
+ "volume": t.volume or 0.0,
762
+ "bid": t.bid or t.price,
763
+ "ask": t.ask or t.price,
764
+ }
765
+ for t in batch.ticks
766
+ ]
767
+ await streaming.ingest_batch(table=table, ticks=ticks)
768
+ return {"ingested": len(ticks), "table": table}
769
+
770
+
771
+ # --- WebSocket Endpoints ---
772
+
773
+ @app.websocket("/ws/ingest/{table}")
774
+ async def ws_ingest(websocket: WebSocket, table: str):
775
+ """
776
+ WebSocket endpoint for streaming tick ingestion.
777
+
778
+ Send JSON messages with tick data:
779
+ {
780
+ "symbol": "BTC-USD",
781
+ "price": 42150.50,
782
+ "timestamp": 1704067200000000000, // Optional, nanoseconds
783
+ "volume": 1.5, // Optional
784
+ "bid": 42150.00, // Optional
785
+ "ask": 42151.00 // Optional
786
+ }
787
+
788
+ Or batches:
789
+ {
790
+ "batch": [
791
+ {"symbol": "BTC-USD", "price": 42150.50, ...},
792
+ {"symbol": "ETH-USD", "price": 2250.25, ...}
793
+ ]
794
+ }
795
+ """
796
+ await websocket.accept()
797
+ streaming = get_streaming_manager()
798
+
799
+ logger.info(f"Ingestion WebSocket connected for table: {table}")
800
+
801
+ try:
802
+ while True:
803
+ data = await websocket.receive_json()
804
+
805
+ if "batch" in data:
806
+ # Batch ingestion
807
+ ticks = data["batch"]
808
+ await streaming.ingest_batch(table=table, ticks=ticks)
809
+ await websocket.send_json({"ack": len(ticks)})
810
+ else:
811
+ # Single tick
812
+ await streaming.ingest_tick(
813
+ table=table,
814
+ symbol=data["symbol"],
815
+ price=data["price"],
816
+ timestamp=data.get("timestamp"),
817
+ volume=data.get("volume", 0.0),
818
+ bid=data.get("bid", data["price"]),
819
+ ask=data.get("ask", data["price"]),
820
+ )
821
+ await websocket.send_json({"ack": 1})
822
+
823
+ except WebSocketDisconnect:
824
+ logger.info(f"Ingestion WebSocket disconnected for table: {table}")
825
+ except Exception as e:
826
+ logger.error(f"Ingestion WebSocket error: {e}")
827
+ await websocket.close(code=1011, reason=str(e))
828
+
829
+
830
+ @app.websocket("/ws/subscribe/{table}")
831
+ async def ws_subscribe(websocket: WebSocket, table: str):
832
+ """
833
+ WebSocket endpoint for subscribing to real-time updates.
834
+
835
+ Optionally send a filter message after connecting:
836
+ {"symbols": ["BTC-USD", "ETH-USD"]}
837
+
838
+ Receives updates as:
839
+ {
840
+ "symbol": "BTC-USD",
841
+ "price": 42150.50,
842
+ "bid": 42150.00,
843
+ "ask": 42151.00,
844
+ "volume": 1.5,
845
+ "timestamp": 1704067200000000000,
846
+ "table": "ticks"
847
+ }
848
+
849
+ Or batches during high-throughput:
850
+ {"batch": [...]}
851
+ """
852
+ await websocket.accept()
853
+ streaming = get_streaming_manager()
854
+
855
+ # Default: subscribe to all symbols
856
+ symbols = None
857
+
858
+ # Check for initial filter message (non-blocking)
859
+ try:
860
+ # Wait briefly for filter message
861
+ data = await asyncio.wait_for(websocket.receive_json(), timeout=0.5)
862
+ if "symbols" in data:
863
+ symbols = data["symbols"]
864
+ logger.info(f"Subscription filter: {symbols}")
865
+ except asyncio.TimeoutError:
866
+ pass
867
+ except Exception:
868
+ pass
869
+
870
+ subscriber = await streaming.subscribe(websocket, table, symbols)
871
+ logger.info(f"Subscription WebSocket connected for table: {table}, symbols: {symbols or 'all'}")
872
+
873
+ try:
874
+ # Keep connection alive, handle any incoming messages
875
+ while True:
876
+ try:
877
+ data = await websocket.receive_json()
878
+ # Handle filter updates
879
+ if "symbols" in data:
880
+ subscriber.symbols = set(data["symbols"]) if data["symbols"] else set()
881
+ await websocket.send_json({"filter_updated": list(subscriber.symbols) or "all"})
882
+ except WebSocketDisconnect:
883
+ raise
884
+ except Exception:
885
+ pass
886
+
887
+ except WebSocketDisconnect:
888
+ logger.info(f"Subscription WebSocket disconnected for table: {table}")
889
+ finally:
890
+ await streaming.unsubscribe(websocket, table)
891
+
892
+
893
+ # --- Streaming Stats ---
894
+
895
+ @app.get("/streaming/stats")
896
+ async def streaming_stats():
897
+ """Get streaming ingestion and pub/sub statistics."""
898
+ streaming = get_streaming_manager()
899
+ return streaming.get_stats()
900
+
901
+
902
+ @app.get("/streaming/quote/{table}/{symbol}")
903
+ async def get_quote(table: str, symbol: str):
904
+ """Get the latest quote for a symbol from the streaming cache."""
905
+ streaming = get_streaming_manager()
906
+ quote = streaming.get_latest_quote(table, symbol)
907
+ if not quote:
908
+ raise HTTPException(404, f"No quote for {symbol} in {table}")
909
+ return quote
910
+
911
+
912
+ @app.get("/streaming/quotes/{table}")
913
+ async def get_all_quotes(table: str):
914
+ """Get all latest quotes for a table from the streaming cache."""
915
+ streaming = get_streaming_manager()
916
+ return streaming.get_all_quotes(table)
917
+
918
+
919
+ @app.get("/streaming/pubsub")
920
+ async def pubsub_stats():
921
+ """Get pub/sub backend statistics (channels, sequences, backend type)."""
922
+ streaming = get_streaming_manager()
923
+ stats = streaming.get_stats()
924
+ return stats.get("pubsub", {"backend": "none", "info": "PubSub not configured"})
925
+
926
+
927
+ # --- KV Store API ---
928
+
929
+ class KVSetRequest(BaseModel):
930
+ """Request body for setting a KV entry."""
931
+ value: Any
932
+ ttl: Optional[float] = None # TTL in seconds, None = no expiry
933
+
934
+
935
+ @app.post("/kv/{key}")
936
+ async def kv_set(key: str, req: KVSetRequest):
937
+ """Set a key-value pair with optional TTL."""
938
+ kv = get_kv_store()
939
+ kv.set(key, req.value, ttl=req.ttl)
940
+ return {"key": key, "ttl": req.ttl}
941
+
942
+
943
+ @app.get("/kv/{key}")
944
+ async def kv_get(key: str):
945
+ """Get a value by key."""
946
+ kv = get_kv_store()
947
+ value = kv.get(key)
948
+ if value is None:
949
+ raise HTTPException(404, f"Key '{key}' not found or expired")
950
+ return {"key": key, "value": value}
951
+
952
+
953
+ @app.delete("/kv/{key}")
954
+ async def kv_delete(key: str):
955
+ """Delete a key."""
956
+ kv = get_kv_store()
957
+ existed = kv.delete(key)
958
+ if not existed:
959
+ raise HTTPException(404, f"Key '{key}' not found")
960
+ return {"deleted": key}
961
+
962
+
963
+ @app.get("/kv")
964
+ async def kv_list(pattern: Optional[str] = None):
965
+ """List keys, optionally filtered by glob pattern."""
966
+ kv = get_kv_store()
967
+ keys = kv.keys(pattern)
968
+ return {"keys": keys, "count": len(keys)}
969
+
970
+
971
+ @app.get("/kv-stats")
972
+ async def kv_stats():
973
+ """Get KV store statistics."""
974
+ kv = get_kv_store()
975
+ return kv.stats()
976
+
977
+
978
+ # --- General Pub/Sub API ---
979
+
980
+ class PubSubPublishRequest(BaseModel):
981
+ """Request body for publishing to a channel."""
982
+ data: Any
983
+
984
+
985
+ @app.post("/pubsub/publish/{channel}")
986
+ async def pubsub_publish(channel: str, req: PubSubPublishRequest):
987
+ """Publish a message to a channel."""
988
+ streaming = get_streaming_manager()
989
+ # Publish through the pub/sub backend so in-memory and Redis subscribers both receive it
990
+ if streaming._pubsub:
991
+ await streaming._pubsub.publish(channel, req.data)
992
+ return {"channel": channel, "published": True}
993
+
994
+
995
+ @app.websocket("/ws/pubsub")
996
+ async def ws_pubsub(websocket: WebSocket):
997
+ """
998
+ General pub/sub WebSocket endpoint.
999
+
1000
+ Send subscription request after connecting:
1001
+ {"action": "subscribe", "channels": ["prices:*", "trades"]}
1002
+
1003
+ Receives messages as:
1004
+ {"channel": "prices:BTC-USD", "data": {...}}
1005
+ """
1006
+ await websocket.accept()
1007
+ streaming = get_streaming_manager()
1008
+
1009
+ subscribed_channels: list[str] = []
1010
+
1011
+ logger.info("PubSub WebSocket connected")
1012
+
1013
+ try:
1014
+ while True:
1015
+ data = await websocket.receive_json()
1016
+
1017
+ action = data.get("action")
1018
+ if action == "subscribe":
1019
+ channels = data.get("channels", [])
1020
+ subscribed_channels.extend(channels)
1021
+ await websocket.send_json({
1022
+ "type": "subscribed",
1023
+ "channels": subscribed_channels,
1024
+ })
1025
+ elif action == "ping":
1026
+ await websocket.send_json({"type": "pong"})
1027
+
1028
+ except WebSocketDisconnect:
1029
+ logger.info("PubSub WebSocket disconnected")
1030
+ except Exception as e:
1031
+ logger.error(f"PubSub WebSocket error: {e}")
api/pubsub.py ADDED
@@ -0,0 +1,547 @@
1
+ """
2
+ WayyDB PubSub Abstraction Layer
3
+
4
+ Provides a pluggable pub/sub transport for real-time tick distribution.
5
+ Two backends:
6
+ - InMemoryPubSub: Default, zero dependencies, single-process
7
+ - RedisPubSub: Optional, requires redis-py, multi-process capable
8
+
9
+ Configure via REDIS_URL environment variable:
10
+ - Not set or empty: uses InMemoryPubSub
11
+ - Set to redis://...: uses RedisPubSub
12
+
13
+ Channel naming convention:
14
+ ticks:{symbol} - Trade ticks for a symbol
15
+ quotes:{symbol} - Quote updates for a symbol
16
+ ticks:* - All trade ticks
17
+ {table}:{symbol} - Generic table:symbol pattern
18
+ """
19
+
20
+ import asyncio
21
+ import logging
22
+ import time
23
+ from abc import ABC, abstractmethod
24
+ from collections import defaultdict, deque
25
+ from dataclasses import dataclass, field
26
+ from typing import Any, Callable, Coroutine, Dict, List, Optional, Set
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ # Type alias for async callback
31
+ AsyncCallback = Callable[[dict], Coroutine[Any, Any, None]]
32
+
33
+
34
+ @dataclass
35
+ class Message:
36
+ """A pub/sub message with metadata."""
37
+ channel: str
38
+ data: dict
39
+ sequence: int
40
+ timestamp: float = field(default_factory=time.time)
41
+
42
+
43
+ class PubSubBackend(ABC):
44
+ """Abstract pub/sub backend interface.
45
+
46
+ Implementations must provide publish, subscribe, and unsubscribe.
47
+ This abstraction allows swapping between in-memory, Redis, NATS, etc.
48
+ """
49
+
50
+ @abstractmethod
51
+ async def publish(self, channel: str, data: dict) -> int:
52
+ """Publish a message to a channel.
53
+
54
+ Args:
55
+ channel: Channel name (e.g., "ticks:AAPL")
56
+ data: Message payload
57
+
58
+ Returns:
59
+ Sequence number of the published message
60
+ """
61
+ ...
62
+
63
+ @abstractmethod
64
+ async def subscribe(
65
+ self,
66
+ channel: str,
67
+ callback: AsyncCallback,
68
+ subscriber_id: str = "",
69
+ ) -> None:
70
+ """Subscribe to a channel with a callback.
71
+
72
+ Args:
73
+ channel: Channel name or pattern (e.g., "ticks:AAPL" or "ticks:*")
74
+ callback: Async function called with each message dict
75
+ subscriber_id: Unique identifier for this subscriber
76
+ """
77
+ ...
78
+
79
+ @abstractmethod
80
+ async def unsubscribe(self, channel: str, subscriber_id: str = "") -> None:
81
+ """Unsubscribe from a channel.
82
+
83
+ Args:
84
+ channel: Channel name or pattern
85
+ subscriber_id: The subscriber to remove
86
+ """
87
+ ...
88
+
89
+ @abstractmethod
90
+ async def publish_batch(self, channel: str, messages: List[dict]) -> int:
91
+ """Publish a batch of messages to a channel.
92
+
93
+ Args:
94
+ channel: Channel name
95
+ messages: List of message payloads
96
+
97
+ Returns:
98
+ Sequence number of the last message
99
+ """
100
+ ...
101
+
102
+ @abstractmethod
103
+ def get_stats(self) -> dict:
104
+ """Get pub/sub statistics."""
105
+ ...
106
+
107
+ @abstractmethod
108
+ async def start(self) -> None:
109
+ """Start the backend (connect, initialize)."""
110
+ ...
111
+
112
+ @abstractmethod
113
+ async def stop(self) -> None:
114
+ """Stop the backend (disconnect, cleanup)."""
115
+ ...
116
+
117
+
118
+ class InMemoryPubSub(PubSubBackend):
119
+ """In-process pub/sub using asyncio.
120
+
121
+ Features:
122
+ - Channel-based routing with wildcard support
123
+ - Per-channel sequence numbers
124
+ - Ring buffer for backpressure (drops oldest on overflow)
125
+ - Concurrent broadcast via asyncio.gather
126
+ - Message replay from buffer
127
+ """
128
+
129
+ def __init__(
130
+ self,
131
+ max_buffer_per_channel: int = 10000,
132
+ broadcast_timeout: float = 5.0,
133
+ ):
134
+ self._subscribers: Dict[str, Dict[str, AsyncCallback]] = defaultdict(dict)
135
+ self._sequence: Dict[str, int] = defaultdict(int)
136
+ self._buffers: Dict[str, deque] = {}
137
+ self._max_buffer = max_buffer_per_channel
138
+ self._broadcast_timeout = broadcast_timeout
139
+ self._stats = {
140
+ "messages_published": 0,
141
+ "messages_delivered": 0,
142
+ "messages_dropped": 0,
143
+ "active_subscriptions": 0,
144
+ "channels": 0,
145
+ }
146
+ self._running = False
147
+
148
+ async def start(self) -> None:
149
+ self._running = True
150
+ logger.info("InMemoryPubSub started")
151
+
152
+ async def stop(self) -> None:
153
+ self._running = False
154
+ self._subscribers.clear()
155
+ self._buffers.clear()
156
+ logger.info("InMemoryPubSub stopped")
157
+
158
+ async def publish(self, channel: str, data: dict) -> int:
159
+ self._sequence[channel] += 1
160
+ seq = self._sequence[channel]
161
+
162
+ msg = Message(channel=channel, data=data, sequence=seq)
163
+
164
+ # Buffer the message
165
+ if channel not in self._buffers:
166
+ self._buffers[channel] = deque(maxlen=self._max_buffer)
167
+ buf = self._buffers[channel]
168
+ if len(buf) >= self._max_buffer:
169
+ self._stats["messages_dropped"] += 1
170
+ buf.append(msg)
171
+
172
+ self._stats["messages_published"] += 1
173
+ self._stats["channels"] = len(self._buffers)
174
+
175
+ # Deliver to subscribers
176
+ await self._deliver(channel, data, seq)
177
+
178
+ return seq
179
+
180
+ async def publish_batch(self, channel: str, messages: List[dict]) -> int:
181
+ last_seq = 0
182
+ for data in messages:
183
+ last_seq = await self.publish(channel, data)
184
+ return last_seq
185
+
186
+ async def subscribe(
187
+ self,
188
+ channel: str,
189
+ callback: AsyncCallback,
190
+ subscriber_id: str = "",
191
+ ) -> None:
192
+ if not subscriber_id:
193
+ subscriber_id = f"sub_{id(callback)}"
194
+
195
+ self._subscribers[channel][subscriber_id] = callback
196
+ self._stats["active_subscriptions"] = sum(
197
+ len(subs) for subs in self._subscribers.values()
198
+ )
199
+ logger.debug(f"Subscribed {subscriber_id} to {channel}")
200
+
201
+ async def unsubscribe(self, channel: str, subscriber_id: str = "") -> None:
202
+ if channel in self._subscribers:
203
+ if subscriber_id and subscriber_id in self._subscribers[channel]:
204
+ del self._subscribers[channel][subscriber_id]
205
+ elif not subscriber_id:
206
+ self._subscribers[channel].clear()
207
+
208
+ if not self._subscribers[channel]:
209
+ del self._subscribers[channel]
210
+
211
+ self._stats["active_subscriptions"] = sum(
212
+ len(subs) for subs in self._subscribers.values()
213
+ )
214
+
215
+ async def _deliver(self, channel: str, data: dict, sequence: int) -> None:
216
+ """Deliver message to all matching subscribers concurrently."""
217
+ enriched = {**data, "_seq": sequence, "_channel": channel}
218
+
219
+ # Collect all matching callbacks
220
+ callbacks: List[AsyncCallback] = []
221
+
222
+ # Exact match subscribers
223
+ if channel in self._subscribers:
224
+ callbacks.extend(self._subscribers[channel].values())
225
+
226
+ # Wildcard subscribers (e.g., "ticks:*" matches "ticks:AAPL")
227
+ for pattern, subs in self._subscribers.items():
228
+ if pattern.endswith(":*"):
229
+ prefix = pattern[:-1] # "ticks:"
230
+ if channel.startswith(prefix) and pattern != channel:
231
+ callbacks.extend(subs.values())
232
+
233
+ if not callbacks:
234
+ return
235
+
236
+ # Concurrent delivery with timeout
237
+ dead_callbacks: List[AsyncCallback] = []
238
+
239
+ async def safe_deliver(cb: AsyncCallback) -> None:
240
+ try:
241
+ await asyncio.wait_for(cb(enriched), timeout=self._broadcast_timeout)
242
+ self._stats["messages_delivered"] += 1
243
+ except asyncio.TimeoutError:
244
+ logger.warning(f"Subscriber timed out on {channel}")
245
+ dead_callbacks.append(cb)
246
+ except Exception:
247
+ dead_callbacks.append(cb)
248
+
249
+ await asyncio.gather(*(safe_deliver(cb) for cb in callbacks))
250
+
251
+ # Remove dead subscribers
252
+ for dead_cb in dead_callbacks:
253
+ for pattern, subs in list(self._subscribers.items()):
254
+ to_remove = [
255
+ sid for sid, cb in subs.items() if cb is dead_cb
256
+ ]
257
+ for sid in to_remove:
258
+ del subs[sid]
259
+ logger.debug(f"Removed dead subscriber {sid} from {pattern}")
260
+
261
+ if dead_callbacks:
262
+ self._stats["active_subscriptions"] = sum(
263
+ len(subs) for subs in self._subscribers.values()
264
+ )
265
+
266
+ def get_channel_buffer(self, channel: str, since_seq: int = 0) -> List[Message]:
267
+ """Get buffered messages for replay.
268
+
269
+ Args:
270
+ channel: Channel name
271
+ since_seq: Only return messages with sequence > since_seq
272
+
273
+ Returns:
274
+ List of messages for replay
275
+ """
276
+ if channel not in self._buffers:
277
+ return []
278
+ return [m for m in self._buffers[channel] if m.sequence > since_seq]
279
+
280
+ def get_stats(self) -> dict:
281
+ return {
282
+ "backend": "in_memory",
283
+ **self._stats,
284
+ "buffer_sizes": {ch: len(buf) for ch, buf in self._buffers.items()},
285
+ }
286
+
287
+
288
+ class RedisPubSub(PubSubBackend):
289
+ """Redis-backed pub/sub for multi-process deployments.
290
+
291
+ Uses Redis pub/sub for real-time delivery and Redis Streams
292
+ for message persistence and replay.
293
+
294
+ Requires: pip install redis[hiredis]
295
+ Configure via REDIS_URL environment variable.
296
+ """
297
+
298
+ def __init__(self, redis_url: str, max_stream_len: int = 100000):
299
+ self._redis_url = redis_url
300
+ self._max_stream_len = max_stream_len
301
+ self._redis = None
302
+ self._pubsub = None
303
+ self._subscribers: Dict[str, Dict[str, AsyncCallback]] = defaultdict(dict)
304
+ self._sequence: Dict[str, int] = defaultdict(int)
305
+ self._listener_task: Optional[asyncio.Task] = None
306
+ self._running = False
307
+ self._stats = {
308
+ "messages_published": 0,
309
+ "messages_delivered": 0,
310
+ "messages_dropped": 0,
311
+ "active_subscriptions": 0,
312
+ "channels": 0,
313
+ "redis_connected": False,
314
+ }
315
+
316
+ async def start(self) -> None:
317
+ try:
318
+ import redis.asyncio as aioredis
319
+ except ImportError:
320
+ raise ImportError(
321
+ "redis package required for RedisPubSub. "
322
+ "Install with: pip install redis[hiredis]"
323
+ )
324
+
325
+ self._redis = aioredis.from_url(
326
+ self._redis_url,
327
+ decode_responses=True,
328
+ socket_connect_timeout=5,
329
+ retry_on_timeout=True,
330
+ )
331
+
332
+ # Test connection
333
+ await self._redis.ping()
334
+ self._stats["redis_connected"] = True
335
+
336
+ self._pubsub = self._redis.pubsub()
337
+ self._running = True
338
+ self._listener_task = asyncio.create_task(self._listen_loop())
339
+
340
+ logger.info(f"RedisPubSub connected to {self._redis_url}")
341
+
342
+ async def stop(self) -> None:
343
+ self._running = False
344
+
345
+ if self._listener_task:
346
+ self._listener_task.cancel()
347
+ try:
348
+ await self._listener_task
349
+ except asyncio.CancelledError:
350
+ pass
351
+
352
+ if self._pubsub:
353
+ await self._pubsub.unsubscribe()
354
+ await self._pubsub.close()
355
+
356
+ if self._redis:
357
+ await self._redis.close()
358
+
359
+ self._stats["redis_connected"] = False
360
+ logger.info("RedisPubSub stopped")
361
+
362
+ async def publish(self, channel: str, data: dict) -> int:
363
+ import json
364
+
365
+ self._sequence[channel] += 1
366
+ seq = self._sequence[channel]
367
+
368
+ enriched = {**data, "_seq": seq, "_ts": time.time()}
369
+ payload = json.dumps(enriched)
370
+
371
+ # Publish to Redis pub/sub channel
372
+ await self._redis.publish(f"wayy:{channel}", payload)
373
+
374
+ # Also write to Redis Stream for persistence/replay
375
+ stream_key = f"wayy:stream:{channel}"
376
+ await self._redis.xadd(
377
+ stream_key,
378
+ {"data": payload},
379
+ maxlen=self._max_stream_len,
380
+ )
381
+
382
+ self._stats["messages_published"] += 1
383
+ return seq
384
+
385
+ async def publish_batch(self, channel: str, messages: List[dict]) -> int:
386
+ import json
387
+
388
+ pipe = self._redis.pipeline()
389
+ last_seq = 0
390
+
391
+ for data in messages:
392
+ self._sequence[channel] += 1
393
+ seq = self._sequence[channel]
394
+ last_seq = seq
395
+
396
+ enriched = {**data, "_seq": seq, "_ts": time.time()}
397
+ payload = json.dumps(enriched)
398
+
399
+ pipe.publish(f"wayy:{channel}", payload)
400
+
401
+ stream_key = f"wayy:stream:{channel}"
402
+ pipe.xadd(stream_key, {"data": payload}, maxlen=self._max_stream_len)
403
+
404
+ await pipe.execute()
405
+ self._stats["messages_published"] += len(messages)
406
+ return last_seq
407
+
408
+ async def subscribe(
409
+ self,
410
+ channel: str,
411
+ callback: AsyncCallback,
412
+ subscriber_id: str = "",
413
+ ) -> None:
414
+ if not subscriber_id:
415
+ subscriber_id = f"sub_{id(callback)}"
416
+
417
+ is_new_channel = channel not in self._subscribers or not self._subscribers[channel]
418
+ self._subscribers[channel][subscriber_id] = callback
419
+
420
+ if is_new_channel and self._pubsub:
421
+ if channel.endswith(":*"):
422
+ await self._pubsub.psubscribe(f"wayy:{channel}")
423
+ else:
424
+ await self._pubsub.subscribe(f"wayy:{channel}")
425
+
426
+ self._stats["active_subscriptions"] = sum(
427
+ len(subs) for subs in self._subscribers.values()
428
+ )
429
+ self._stats["channels"] = len(self._subscribers)
430
+
431
+ async def unsubscribe(self, channel: str, subscriber_id: str = "") -> None:
432
+ if channel in self._subscribers:
433
+ if subscriber_id and subscriber_id in self._subscribers[channel]:
434
+ del self._subscribers[channel][subscriber_id]
435
+ elif not subscriber_id:
436
+ self._subscribers[channel].clear()
437
+
438
+ if not self._subscribers[channel]:
439
+ del self._subscribers[channel]
440
+ if self._pubsub:
441
+ if channel.endswith(":*"):
442
+ await self._pubsub.punsubscribe(f"wayy:{channel}")
443
+ else:
444
+ await self._pubsub.unsubscribe(f"wayy:{channel}")
445
+
446
+ self._stats["active_subscriptions"] = sum(
447
+ len(subs) for subs in self._subscribers.values()
448
+ )
449
+
450
+ async def _listen_loop(self) -> None:
451
+ """Background task that listens for Redis pub/sub messages."""
452
+ import json
453
+
454
+ while self._running:
455
+ try:
456
+ message = await self._pubsub.get_message(
457
+ ignore_subscribe_messages=True, timeout=0.1
458
+ )
459
+ if message is None:
460
+ await asyncio.sleep(0.01)
461
+ continue
462
+
463
+ if message["type"] not in ("message", "pmessage"):
464
+ continue
465
+
466
+ raw_channel = message.get("channel", "")
467
+ # Strip "wayy:" prefix
468
+ if raw_channel.startswith("wayy:"):
469
+ channel = raw_channel[5:]
470
+ else:
471
+ channel = raw_channel
472
+
473
+ data = json.loads(message["data"])
474
+
475
+ # Deliver to local subscribers
476
+ await self._deliver_local(channel, data)
477
+
478
+ except asyncio.CancelledError:
479
+ raise
480
+ except Exception as e:
481
+ logger.error(f"Redis listener error: {e}")
482
+ await asyncio.sleep(1.0)
483
+
484
+ async def _deliver_local(self, channel: str, data: dict) -> None:
485
+ """Deliver a received message to local subscribers."""
486
+ callbacks: List[AsyncCallback] = []
487
+
488
+ if channel in self._subscribers:
489
+ callbacks.extend(self._subscribers[channel].values())
490
+
491
+ # Wildcard matching
492
+ for pattern, subs in self._subscribers.items():
493
+ if pattern.endswith(":*"):
494
+ prefix = pattern[:-1]
495
+ if channel.startswith(prefix) and pattern != channel:
496
+ callbacks.extend(subs.values())
497
+
498
+ for cb in callbacks:
499
+ try:
500
+ await asyncio.wait_for(cb(data), timeout=5.0)
501
+ self._stats["messages_delivered"] += 1
502
+ except Exception:
503
+ self._stats["messages_dropped"] += 1
504
+
505
+ async def replay(
506
+ self, channel: str, since_id: str = "0-0", count: int = 1000
507
+ ) -> List[dict]:
508
+ """Replay messages from Redis Stream.
509
+
510
+ Args:
511
+ channel: Channel name
512
+ since_id: Redis Stream ID to start from
513
+ count: Maximum messages to return
514
+
515
+ Returns:
516
+ List of message dicts
517
+ """
518
+ import json
519
+
520
+ stream_key = f"wayy:stream:{channel}"
521
+ messages = await self._redis.xrange(stream_key, min=since_id, count=count)
522
+
523
+ return [json.loads(entry["data"]) for _id, entry in messages]
524
+
525
+ def get_stats(self) -> dict:
526
+ return {
527
+ "backend": "redis",
528
+ "redis_url": self._redis_url.split("@")[-1] if "@" in self._redis_url else self._redis_url,
529
+ **self._stats,
530
+ }
531
+
532
+
533
+ def create_pubsub(redis_url: Optional[str] = None) -> PubSubBackend:
534
+ """Factory function to create the appropriate PubSub backend.
535
+
536
+ Args:
537
+ redis_url: Redis URL. If None/empty, uses InMemoryPubSub.
538
+
539
+ Returns:
540
+ PubSubBackend instance
541
+ """
542
+ if redis_url:
543
+ logger.info(f"Using RedisPubSub backend")
544
+ return RedisPubSub(redis_url=redis_url)
545
+ else:
546
+ logger.info("Using InMemoryPubSub backend (set REDIS_URL for Redis)")
547
+ return InMemoryPubSub()
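
A minimal single-process sketch of the in-memory backend defined above; the channel names and payloads are illustrative.

import asyncio

from api.pubsub import create_pubsub


async def main() -> None:
    pubsub = create_pubsub()          # REDIS_URL unset -> InMemoryPubSub
    await pubsub.start()

    received = []

    async def on_tick(msg: dict) -> None:
        # Messages arrive enriched with "_seq" and "_channel"
        received.append((msg["_channel"], msg["_seq"], msg["price"]))

    # Wildcard subscription matches every "ticks:{symbol}" channel
    await pubsub.subscribe("ticks:*", on_tick, subscriber_id="demo")

    await pubsub.publish("ticks:AAPL", {"price": 189.25})
    await pubsub.publish_batch("ticks:MSFT", [{"price": 415.0}, {"price": 415.1}])

    print(received)                    # three deliveries, per-channel sequence numbers
    print(pubsub.get_stats())
    await pubsub.stop()


asyncio.run(main())
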
api/requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+ fastapi>=0.109.0
2
+ uvicorn[standard]>=0.27.0
3
+ numpy>=1.20
4
+ pydantic>=2.0
5
+ websockets>=12.0
6
+ redis[hiredis]>=5.0
api/streaming.py ADDED
@@ -0,0 +1,553 @@
1
+ """
2
+ WayyDB Streaming Module - Real-time data ingestion and pub/sub
3
+
4
+ Provides:
5
+ - WebSocket ingestion endpoint for real-time tick data
6
+ - Pub/Sub subscriptions via pluggable backend (in-memory or Redis)
7
+ - Efficient batching and append operations
8
+ - In-memory buffers with periodic flush to persistent storage
9
+ - Backpressure handling and sequence numbers
10
+
11
+ Configuration via environment variables:
12
+ - FLUSH_INTERVAL: Seconds between flushes to disk (default: 1.0)
13
+ - MAX_BUFFER_SIZE: Max ticks in buffer before force flush (default: 10000)
14
+ - BROADCAST_INTERVAL: Seconds between subscriber broadcasts (default: 0.05)
15
+ - REDIS_URL: Optional Redis URL for distributed pub/sub
16
+ """
17
+
18
+ import asyncio
19
+ import logging
20
+ import os
21
+ import threading
22
+ import time
23
+ from collections import defaultdict
24
+ from dataclasses import dataclass, field
25
+ from datetime import datetime, timezone
26
+ from typing import Any, Dict, List, Optional, Set
27
+
28
+ import numpy as np
29
+ from fastapi import WebSocket
30
+
31
+ from api.pubsub import PubSubBackend, create_pubsub
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+ # Configuration from environment
36
+ DEFAULT_FLUSH_INTERVAL = float(os.getenv("FLUSH_INTERVAL", "1.0"))
37
+ DEFAULT_MAX_BUFFER_SIZE = int(os.getenv("MAX_BUFFER_SIZE", "10000"))
38
+ DEFAULT_BROADCAST_INTERVAL = float(os.getenv("BROADCAST_INTERVAL", "0.05"))
39
+
40
+
41
+ @dataclass
42
+ class TickBuffer:
43
+ """Buffer for incoming tick data before flush to table."""
44
+ timestamps: List[int] = field(default_factory=list)
45
+ symbols: List[str] = field(default_factory=list)
46
+ prices: List[float] = field(default_factory=list)
47
+ volumes: List[float] = field(default_factory=list)
48
+ bids: List[float] = field(default_factory=list)
49
+ asks: List[float] = field(default_factory=list)
50
+
51
+ def append(self, timestamp: int, symbol: str, price: float,
52
+ volume: float = 0.0, bid: float = 0.0, ask: float = 0.0):
53
+ self.timestamps.append(timestamp)
54
+ self.symbols.append(symbol)
55
+ self.prices.append(price)
56
+ self.volumes.append(volume)
57
+ self.bids.append(bid if bid else price)
58
+ self.asks.append(ask if ask else price)
59
+
60
+ def __len__(self):
61
+ return len(self.timestamps)
62
+
63
+ def clear(self):
64
+ self.timestamps.clear()
65
+ self.symbols.clear()
66
+ self.prices.clear()
67
+ self.volumes.clear()
68
+ self.bids.clear()
69
+ self.asks.clear()
70
+
71
+ def to_columnar(self) -> Dict[str, np.ndarray]:
72
+ """Convert to columnar format for WayyDB."""
73
+ return {
74
+ "timestamp": np.array(self.timestamps, dtype=np.int64),
75
+ "symbol": np.array([hash(s) % (2**32) for s in self.symbols], dtype=np.uint32),
76
+ "price": np.array(self.prices, dtype=np.float64),
77
+ "volume": np.array(self.volumes, dtype=np.float64),
78
+ "bid": np.array(self.bids, dtype=np.float64),
79
+ "ask": np.array(self.asks, dtype=np.float64),
80
+ }
81
+
82
+
83
+ @dataclass
84
+ class Subscriber:
85
+ """A WebSocket subscriber to data updates."""
86
+ websocket: WebSocket
87
+ symbols: Set[str] = field(default_factory=set) # Empty = all symbols
88
+ subscriber_id: str = ""
89
+ created_at: float = field(default_factory=time.time)
90
+ messages_sent: int = 0
91
+
92
+
93
+ class StreamingManager:
94
+ """
95
+ Manages streaming data ingestion and pub/sub distribution.
96
+
97
+ Features:
98
+ - Buffer incoming ticks in memory
99
+ - Publish to PubSub channels (in-memory or Redis)
100
+ - Broadcast to WebSocket subscribers via PubSub callbacks
101
+ - Periodic flush to WayyDB tables (atomic swap, no gap)
102
+ - Thread-safe operations via threading.Lock
103
+ """
104
+
105
+ def __init__(
106
+ self,
107
+ flush_interval: float = DEFAULT_FLUSH_INTERVAL,
108
+ max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
109
+ batch_broadcast_interval: float = DEFAULT_BROADCAST_INTERVAL,
110
+ pubsub: Optional[PubSubBackend] = None,
111
+ ):
112
+ self.flush_interval = flush_interval
113
+ self.max_buffer_size = max_buffer_size
114
+ self.batch_broadcast_interval = batch_broadcast_interval
115
+
116
+ # PubSub backend (in-memory default, Redis optional)
117
+ self._pubsub = pubsub
118
+
119
+ # Tick buffers - one per table
120
+ self._buffers: Dict[str, TickBuffer] = defaultdict(TickBuffer)
121
+
122
+ # WebSocket subscribers - one list per table
123
+ self._subscribers: Dict[str, List[Subscriber]] = defaultdict(list)
124
+
125
+ # Latest quotes cache (for new subscribers)
126
+ self._latest_quotes: Dict[str, Dict[str, Any]] = {}
127
+
128
+ # Pending broadcasts (batched for efficiency)
129
+ self._pending_broadcasts: Dict[str, List[Dict]] = defaultdict(list)
130
+
131
+ # Statistics
132
+ self._stats = {
133
+ "ticks_received": 0,
134
+ "ticks_flushed": 0,
135
+ "broadcasts_sent": 0,
136
+ "active_subscribers": 0,
137
+ "flush_count": 0,
138
+ "start_time": None,
139
+ }
140
+
141
+ # Background tasks
142
+ self._running = False
143
+ self._flush_task: Optional[asyncio.Task] = None
144
+ self._broadcast_task: Optional[asyncio.Task] = None
145
+
146
+ # Database reference (set by API)
147
+ self._db = None
148
+
149
+ # FIX: Use threading.Lock for thread safety with ThreadPoolExecutor
150
+ self._lock = threading.Lock()
151
+
152
+ def set_database(self, db):
153
+ """Set the database reference for flushing."""
154
+ self._db = db
155
+
156
+ def set_pubsub(self, pubsub: PubSubBackend):
157
+ """Set the pub/sub backend."""
158
+ self._pubsub = pubsub
159
+
160
+ async def start(self):
161
+ """Start background flush and broadcast tasks."""
162
+ if self._running:
163
+ return
164
+
165
+ self._running = True
166
+ self._stats["start_time"] = datetime.now(timezone.utc).isoformat()
167
+
168
+ # Start PubSub backend if provided
169
+ if self._pubsub:
170
+ await self._pubsub.start()
171
+
172
+ self._flush_task = asyncio.create_task(self._flush_loop())
173
+ self._broadcast_task = asyncio.create_task(self._broadcast_loop())
174
+
175
+ logger.info("StreamingManager started")
176
+
177
+ async def stop(self):
178
+ """Stop background tasks and flush remaining data."""
179
+ if not self._running:
180
+ return
181
+
182
+ self._running = False
183
+
184
+ if self._flush_task:
185
+ self._flush_task.cancel()
186
+ try:
187
+ await self._flush_task
188
+ except asyncio.CancelledError:
189
+ pass
190
+
191
+ if self._broadcast_task:
192
+ self._broadcast_task.cancel()
193
+ try:
194
+ await self._broadcast_task
195
+ except asyncio.CancelledError:
196
+ pass
197
+
198
+ # Final flush
199
+ await self._flush_all()
200
+
201
+ # Stop PubSub backend
202
+ if self._pubsub:
203
+ await self._pubsub.stop()
204
+
205
+ logger.info("StreamingManager stopped")
206
+
207
+ async def ingest_tick(
208
+ self,
209
+ table: str,
210
+ symbol: str,
211
+ price: float,
212
+ timestamp: Optional[int] = None,
213
+ volume: float = 0.0,
214
+ bid: float = 0.0,
215
+ ask: float = 0.0,
216
+ ):
217
+ """Ingest a single tick."""
218
+ if timestamp is None:
219
+ timestamp = int(datetime.now(timezone.utc).timestamp() * 1e9)
220
+
221
+ # Add to buffer (thread-safe)
222
+ with self._lock:
223
+ self._buffers[table].append(
224
+ timestamp=timestamp,
225
+ symbol=symbol,
226
+ price=price,
227
+ volume=volume,
228
+ bid=bid,
229
+ ask=ask,
230
+ )
231
+ self._stats["ticks_received"] += 1
232
+
233
+ # Build quote message
234
+ quote = {
235
+ "symbol": symbol,
236
+ "price": price,
237
+ "bid": bid or price,
238
+ "ask": ask or price,
239
+ "volume": volume,
240
+ "timestamp": timestamp,
241
+ "table": table,
242
+ }
243
+ self._latest_quotes[f"{table}:{symbol}"] = quote
244
+
245
+ # Publish to PubSub channel
246
+ if self._pubsub:
247
+ channel = f"{table}:{symbol}"
248
+ await self._pubsub.publish(channel, quote)
249
+
250
+ # Queue for WebSocket broadcast
251
+ self._pending_broadcasts[table].append(quote)
252
+
253
+ # Force flush if buffer too large
254
+ if len(self._buffers[table]) >= self.max_buffer_size:
255
+ await self._flush_table(table)
256
+
257
+ async def ingest_batch(
258
+ self,
259
+ table: str,
260
+ ticks: List[Dict[str, Any]],
261
+ ):
262
+ """Ingest a batch of ticks efficiently."""
263
+ quotes_by_channel: Dict[str, List[dict]] = defaultdict(list)
264
+
265
+ with self._lock:
266
+ buffer = self._buffers[table]
267
+ for tick in ticks:
268
+ timestamp = tick.get("timestamp")
269
+ if timestamp is None:
270
+ timestamp = int(datetime.now(timezone.utc).timestamp() * 1e9)
271
+
272
+ buffer.append(
273
+ timestamp=timestamp,
274
+ symbol=tick["symbol"],
275
+ price=tick["price"],
276
+ volume=tick.get("volume", 0.0),
277
+ bid=tick.get("bid", tick["price"]),
278
+ ask=tick.get("ask", tick["price"]),
279
+ )
280
+
281
+ quote = {
282
+ "symbol": tick["symbol"],
283
+ "price": tick["price"],
284
+ "bid": tick.get("bid", tick["price"]),
285
+ "ask": tick.get("ask", tick["price"]),
286
+ "volume": tick.get("volume", 0.0),
287
+ "timestamp": timestamp,
288
+ "table": table,
289
+ }
290
+ self._latest_quotes[f"{table}:{tick['symbol']}"] = quote
291
+ self._pending_broadcasts[table].append(quote)
292
+
293
+ channel = f"{table}:{tick['symbol']}"
294
+ quotes_by_channel[channel].append(quote)
295
+
296
+ self._stats["ticks_received"] += len(ticks)
297
+
298
+ # Batch publish to PubSub channels
299
+ if self._pubsub:
300
+ for channel, channel_quotes in quotes_by_channel.items():
301
+ await self._pubsub.publish_batch(channel, channel_quotes)
302
+
303
+ # Force flush if buffer too large
304
+ if len(self._buffers[table]) >= self.max_buffer_size:
305
+ await self._flush_table(table)
306
+
307
+ async def subscribe(self, websocket: WebSocket, table: str, symbols: Optional[List[str]] = None):
308
+ """Add a WebSocket subscriber to a table's updates."""
309
+ sub_id = f"ws_{id(websocket)}"
310
+ subscriber = Subscriber(
311
+ websocket=websocket,
312
+ symbols=set(symbols) if symbols else set(),
313
+ subscriber_id=sub_id,
314
+ )
315
+
316
+ self._subscribers[table].append(subscriber)
317
+ self._stats["active_subscribers"] = sum(len(s) for s in self._subscribers.values())
318
+
319
+ # Send current latest quotes to new subscriber
320
+ for key, quote in self._latest_quotes.items():
321
+ if key.startswith(f"{table}:"):
322
+ symbol = key.split(":", 1)[1]
323
+ if not subscriber.symbols or symbol in subscriber.symbols:
324
+ try:
325
+ await websocket.send_json(quote)
326
+ except Exception:
327
+ pass
328
+
329
+ logger.info(f"New subscriber for {table}, symbols={symbols or 'all'}")
330
+ return subscriber
331
+
332
+ async def unsubscribe(self, websocket: WebSocket, table: str):
333
+ """Remove a subscriber."""
334
+ self._subscribers[table] = [
335
+ s for s in self._subscribers[table]
336
+ if s.websocket != websocket
337
+ ]
338
+ self._stats["active_subscribers"] = sum(len(s) for s in self._subscribers.values())
339
+
340
+ async def _flush_loop(self):
341
+ """Background task to periodically flush buffers."""
342
+ while self._running:
343
+ try:
344
+ await asyncio.sleep(self.flush_interval)
345
+ await self._flush_all()
346
+ except asyncio.CancelledError:
347
+ raise
348
+ except Exception as e:
349
+ logger.error(f"Flush error: {e}")
350
+
351
+ async def _flush_all(self):
352
+ """Flush all buffers to database."""
353
+ with self._lock:
354
+ tables = list(self._buffers.keys())
355
+
356
+ for table in tables:
357
+ await self._flush_table(table)
358
+
359
+ async def _flush_table(self, table: str):
360
+ """Flush a single table's buffer to database.
361
+
362
+ FIX: Atomic table swap - build new table first, then replace.
363
+ The old table remains readable until the swap completes.
364
+ """
365
+ if self._db is None:
366
+ return
367
+
368
+ with self._lock:
369
+ buffer = self._buffers[table]
370
+ if len(buffer) == 0:
371
+ return
372
+
373
+ # Get columnar data and clear buffer
374
+ data = buffer.to_columnar()
375
+ count = len(buffer)
376
+ buffer.clear()
377
+
378
+ try:
379
+ import wayy_db as wdb
380
+
381
+ if self._db.has_table(table):
382
+ existing = self._db[table]
383
+
384
+ # Read existing data
385
+ existing_data = {}
386
+ for col_name in existing.column_names():
387
+ existing_data[col_name] = existing[col_name].to_numpy()
388
+
389
+ # Concatenate
390
+ combined = {}
391
+ for col_name, new_arr in data.items():
392
+ if col_name in existing_data:
393
+ combined[col_name] = np.concatenate([existing_data[col_name], new_arr])
394
+ else:
395
+ combined[col_name] = new_arr
396
+
397
+ # FIX: Build new table FIRST, then atomic swap
398
+ new_table = wdb.from_dict(combined, name=table, sorted_by="timestamp")
399
+ self._db.drop_table(table)
400
+ self._db.add_table(new_table)
401
+ else:
402
+ new_table = wdb.from_dict(data, name=table, sorted_by="timestamp")
403
+ self._db.add_table(new_table)
404
+
405
+ self._db.save()
406
+
407
+ self._stats["ticks_flushed"] += count
408
+ self._stats["flush_count"] += 1
409
+
410
+ logger.debug(f"Flushed {count} ticks to {table}")
411
+
412
+ except Exception as e:
413
+ logger.error(f"Failed to flush {table}: {e}")
414
+ # Re-add data to buffer on failure
415
+ with self._lock:
416
+ buf = self._buffers[table]
417
+ for i in range(len(data["timestamp"])):
418
+ buf.timestamps.append(int(data["timestamp"][i]))
419
+ buf.symbols.append(f"unknown") # Symbol hash lost, but data preserved
420
+ buf.prices.append(float(data["price"][i]))
421
+ buf.volumes.append(float(data["volume"][i]))
422
+ buf.bids.append(float(data["bid"][i]))
423
+ buf.asks.append(float(data["ask"][i]))
424
+
425
+ async def _broadcast_loop(self):
426
+ """Background task to batch-broadcast updates to WebSocket subscribers."""
427
+ while self._running:
428
+ try:
429
+ await asyncio.sleep(self.batch_broadcast_interval)
430
+ await self._broadcast_pending()
431
+ except asyncio.CancelledError:
432
+ raise
433
+ except Exception as e:
434
+ logger.error(f"Broadcast error: {e}")
435
+
436
+ async def _broadcast_pending(self):
437
+ """Broadcast pending updates to all subscribers.
438
+
439
+ FIX: Uses asyncio.gather for concurrent WebSocket sends.
440
+ One slow subscriber no longer blocks all others.
441
+ """
442
+ # Swap out pending broadcasts atomically
443
+ pending = dict(self._pending_broadcasts)
444
+ self._pending_broadcasts = defaultdict(list)
445
+
446
+ for table, quotes in pending.items():
447
+ if not quotes:
448
+ continue
449
+
450
+ subscribers = self._subscribers.get(table, [])
451
+ if not subscribers:
452
+ continue
453
+
454
+ # Build send tasks for all subscribers concurrently
455
+ send_tasks = []
456
+ sub_task_map: List[Subscriber] = []
457
+
458
+ for sub in subscribers:
459
+ if sub.symbols:
460
+ filtered = [q for q in quotes if q["symbol"] in sub.symbols]
461
+ else:
462
+ filtered = quotes
463
+
464
+ if not filtered:
465
+ continue
466
+
467
+ if len(filtered) == 1:
468
+ payload = filtered[0]
469
+ else:
470
+ payload = {"batch": filtered}
471
+
472
+ send_tasks.append(self._safe_send(sub.websocket, payload))
473
+ sub_task_map.append(sub)
474
+
475
+ if not send_tasks:
476
+ continue
477
+
478
+ # FIX: Concurrent sends via asyncio.gather
479
+ results = await asyncio.gather(*send_tasks, return_exceptions=True)
480
+
481
+ dead_subs = []
482
+ for sub, result in zip(sub_task_map, results):
483
+ if isinstance(result, Exception):
484
+ dead_subs.append(sub)
485
+ else:
486
+ count = len(quotes) if not sub.symbols else len(
487
+ [q for q in quotes if q["symbol"] in sub.symbols]
488
+ )
489
+ sub.messages_sent += count
490
+ self._stats["broadcasts_sent"] += count
491
+
492
+ # Remove dead subscribers
493
+ for sub in dead_subs:
494
+ if sub in self._subscribers[table]:
495
+ self._subscribers[table].remove(sub)
496
+
497
+ @staticmethod
498
+ async def _safe_send(websocket: WebSocket, payload: Any) -> None:
499
+ """Send JSON to a WebSocket with timeout."""
500
+ await asyncio.wait_for(websocket.send_json(payload), timeout=5.0)
501
+
502
+ def get_stats(self) -> Dict[str, Any]:
503
+ """Get streaming statistics."""
504
+ stats = {
505
+ **self._stats,
506
+ "buffer_sizes": {t: len(b) for t, b in self._buffers.items()},
507
+ "subscriber_counts": {t: len(s) for t, s in self._subscribers.items()},
508
+ "latest_quotes": len(self._latest_quotes),
509
+ "running": self._running,
510
+ }
511
+ if self._pubsub:
512
+ stats["pubsub"] = self._pubsub.get_stats()
513
+ return stats
514
+
515
+ def get_latest_quote(self, table: str, symbol: str) -> Optional[Dict[str, Any]]:
516
+ """Get the latest quote for a symbol."""
517
+ return self._latest_quotes.get(f"{table}:{symbol}")
518
+
519
+ def get_all_quotes(self, table: str) -> Dict[str, Dict[str, Any]]:
520
+ """Get all latest quotes for a table."""
521
+ prefix = f"{table}:"
522
+ return {
523
+ k.split(":", 1)[1]: v
524
+ for k, v in self._latest_quotes.items()
525
+ if k.startswith(prefix)
526
+ }
527
+
528
+
529
+ # Global streaming manager instance
530
+ _streaming_manager: Optional[StreamingManager] = None
531
+
532
+
533
+ def get_streaming_manager() -> StreamingManager:
534
+ """Get or create the global streaming manager."""
535
+ global _streaming_manager
536
+ if _streaming_manager is None:
537
+ redis_url = os.getenv("REDIS_URL", "")
538
+ pubsub = create_pubsub(redis_url if redis_url else None)
539
+ _streaming_manager = StreamingManager(pubsub=pubsub)
540
+ return _streaming_manager
541
+
542
+
543
+ async def start_streaming():
544
+ """Start the global streaming manager."""
545
+ manager = get_streaming_manager()
546
+ await manager.start()
547
+
548
+
549
+ async def stop_streaming():
550
+ """Stop the global streaming manager."""
551
+ global _streaming_manager
552
+ if _streaming_manager:
553
+ await _streaming_manager.stop()
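
A sketch of driving the streaming manager above directly, without FastAPI and without a database attached (so the periodic flush is a no-op); only methods defined in this module are used, and the symbols and prices are illustrative.

import asyncio

from api.streaming import get_streaming_manager


async def main() -> None:
    manager = get_streaming_manager()   # InMemoryPubSub unless REDIS_URL is set
    await manager.start()

    # Ingest a single tick and then a small batch
    await manager.ingest_tick("ticks", "BTC-USD", price=42150.50,
                              bid=42150.0, ask=42151.0, volume=1.5)
    await manager.ingest_batch("ticks", [
        {"symbol": "ETH-USD", "price": 2250.0},
        {"symbol": "ETH-USD", "price": 2251.5, "volume": 3.0},
    ])

    print(manager.get_latest_quote("ticks", "ETH-USD"))   # last quote for the symbol
    print(manager.get_all_quotes("ticks"))                # {"BTC-USD": {...}, "ETH-USD": {...}}
    print(manager.get_stats()["ticks_received"])          # 3

    await manager.stop()


asyncio.run(main())
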
build/_deps/googletest-src ADDED
@@ -0,0 +1 @@
1
+ Subproject commit f8d7d77c06936315286eb55f8de22cd23c188571
build/_deps/pybind11-src ADDED
@@ -0,0 +1 @@
1
+ Subproject commit a2e59f0e7065404b44dfe92a28aca47ba1378dc4
dist/wayy_db-0.1.0-cp310-cp310-linux_x86_64.whl ADDED
Binary file (8.43 kB).
include/wayy_db/column.hpp ADDED
@@ -0,0 +1,135 @@
1
+ #pragma once
2
+
3
+ #include "wayy_db/types.hpp"
4
+ #include "wayy_db/column_view.hpp"
5
+
6
+ #include <bit>
7
+ #include <memory>
8
+ #include <string>
9
+ #include <variant>
10
+ #include <vector>
11
+
12
+ namespace wayy_db {
13
+
14
+ /// Type-erased column that owns its data or references mmap'd memory
15
+ class Column {
16
+ public:
17
+ /// Construct an empty column
18
+ Column() = default;
19
+
20
+ /// Construct a column with owned data
21
+ Column(std::string name, DType dtype, std::vector<uint8_t> data);
22
+
23
+ /// Construct a column referencing external memory (e.g., mmap)
24
+ Column(std::string name, DType dtype, void* data, size_t size, bool owns_data = false);
25
+
26
+ /// Move-only semantics
27
+ Column(Column&&) = default;
28
+ Column& operator=(Column&&) = default;
29
+ Column(const Column&) = delete;
30
+ Column& operator=(const Column&) = delete;
31
+
32
+ /// Column metadata
33
+ const std::string& name() const { return name_; }
34
+ DType dtype() const { return dtype_; }
35
+ size_t size() const { return size_; }
36
+ size_t byte_size() const { return size_ * dtype_size(dtype_); }
37
+
38
+ /// Raw data access
39
+ void* data() { return data_; }
40
+ const void* data() const { return data_; }
41
+
42
+ /// Typed view access (throws TypeMismatch if wrong type)
43
+ template<typename T>
44
+ ColumnView<T> as();
45
+
46
+ template<typename T>
47
+ ColumnView<const T> as() const;
48
+
49
+ /// Convenience accessors
50
+ Int64View as_int64() { return as<int64_t>(); }
51
+ Float64View as_float64() { return as<double>(); }
52
+ TimestampView as_timestamp() { return as<int64_t>(); }
53
+ SymbolView as_symbol() { return as<uint32_t>(); }
54
+ BoolView as_bool() { return as<uint8_t>(); }
55
+
56
+ /// Decimal6 accessor (underlying int64, but tagged as Decimal6)
57
+ Int64View as_decimal6() {
58
+ if (dtype_ != DType::Decimal6) throw TypeMismatch(DType::Decimal6, dtype_);
59
+ return ColumnView<int64_t>(static_cast<int64_t*>(data_), size_);
60
+ }
61
+
62
+ /// Validity bitmap (null/deleted tracking)
63
+ bool has_validity() const { return has_validity_; }
64
+ void ensure_validity(); // Allocate bitmap, mark all valid
65
+ bool is_valid(size_t row) const;
66
+ void set_valid(size_t row, bool valid);
67
+ size_t count_valid() const; // popcount over bitmap
68
+
69
+ /// Direct access to validity bitmap bytes (for persistence)
70
+ const std::vector<uint8_t>& validity_bitmap() const { return validity_; }
71
+ void set_validity_bitmap(std::vector<uint8_t> bitmap);
72
+
73
+ /// Append a single element (column must own its data)
74
+ void append(const void* value, size_t value_size);
75
+
76
+ /// Overwrite element at row index (column must own its data)
77
+ void set(size_t row, const void* value, size_t value_size);
78
+
79
+ private:
80
+ std::string name_;
81
+ DType dtype_ = DType::Int64;
82
+ void* data_ = nullptr;
83
+ size_t size_ = 0;
84
+ bool owns_data_ = false;
85
+ std::vector<uint8_t> owned_data_; // Storage when we own the data
86
+
87
+ // Validity bitmap: 1 bit per row (bit=1 means valid, bit=0 means null/deleted)
88
+ std::vector<uint8_t> validity_;
89
+ bool has_validity_ = false;
90
+
91
+ /// Check that the requested type matches the column's dtype
92
+ template<typename T>
93
+ void check_type() const;
94
+ };
95
+
96
+ // Template implementations
97
+
98
+ template<typename T>
99
+ ColumnView<T> Column::as() {
100
+ check_type<T>();
101
+ return ColumnView<T>(static_cast<T*>(data_), size_);
102
+ }
103
+
104
+ template<typename T>
105
+ ColumnView<const T> Column::as() const {
106
+ check_type<T>();
107
+ return ColumnView<const T>(static_cast<const T*>(data_), size_);
108
+ }
109
+
110
+ template<typename T>
111
+ void Column::check_type() const {
112
+ using U = std::remove_cv_t<T>;
113
+ DType expected;
114
+ if constexpr (std::is_same_v<U, int64_t>) {
115
+ // Could be Int64, Timestamp, or Decimal6 (all stored as int64_t)
116
+ if (dtype_ != DType::Int64 && dtype_ != DType::Timestamp && dtype_ != DType::Decimal6) {
117
+ throw TypeMismatch(DType::Int64, dtype_);
118
+ }
119
+ return;
120
+ } else if constexpr (std::is_same_v<U, double>) {
121
+ expected = DType::Float64;
122
+ } else if constexpr (std::is_same_v<U, uint32_t>) {
123
+ expected = DType::Symbol;
124
+ } else if constexpr (std::is_same_v<U, uint8_t>) {
125
+ expected = DType::Bool;
126
+ } else {
127
+ static_assert(sizeof(U) == 0, "Unsupported column type");
128
+ }
129
+
130
+ if (dtype_ != expected) {
131
+ throw TypeMismatch(expected, dtype_);
132
+ }
133
+ }
134
+
135
+ } // namespace wayy_db
include/wayy_db/column_view.hpp ADDED
@@ -0,0 +1,93 @@
1
+ #pragma once
2
+
3
+ #include <cstddef>
4
+ #include <cstdint>
5
+ #include <span>
6
+ #include <iterator>
+ #include <stdexcept>
7
+
8
+ namespace wayy_db {
9
+
10
+ /// Non-owning typed view over contiguous column data
11
+ /// Provides zero-copy access for SIMD operations and Python bindings
12
+ template<typename T>
13
+ class ColumnView {
14
+ public:
15
+ using value_type = T;
16
+ using size_type = size_t;
17
+ using difference_type = ptrdiff_t;
18
+ using pointer = T*;
19
+ using const_pointer = const T*;
20
+ using reference = T&;
21
+ using const_reference = const T&;
22
+ using iterator = T*;
23
+ using const_iterator = const T*;
24
+
25
+ /// Construct an empty view
26
+ ColumnView() : data_(nullptr), size_(0) {}
27
+
28
+ /// Construct a view over existing data
29
+ ColumnView(T* data, size_t size) : data_(data), size_(size) {}
30
+
31
+ /// Construct from std::span
32
+ explicit ColumnView(std::span<T> span) : data_(span.data()), size_(span.size()) {}
33
+
34
+ // Element access
35
+ reference operator[](size_t i) { return data_[i]; }
36
+ const_reference operator[](size_t i) const { return data_[i]; }
37
+
38
+ reference at(size_t i) {
39
+ if (i >= size_) throw std::out_of_range("ColumnView index out of range");
40
+ return data_[i];
41
+ }
42
+ const_reference at(size_t i) const {
43
+ if (i >= size_) throw std::out_of_range("ColumnView index out of range");
44
+ return data_[i];
45
+ }
46
+
47
+ reference front() { return data_[0]; }
48
+ const_reference front() const { return data_[0]; }
49
+
50
+ reference back() { return data_[size_ - 1]; }
51
+ const_reference back() const { return data_[size_ - 1]; }
52
+
53
+ // Iterators
54
+ iterator begin() { return data_; }
55
+ iterator end() { return data_ + size_; }
56
+ const_iterator begin() const { return data_; }
57
+ const_iterator end() const { return data_ + size_; }
58
+ const_iterator cbegin() const { return data_; }
59
+ const_iterator cend() const { return data_ + size_; }
60
+
61
+ // Capacity
62
+ bool empty() const { return size_ == 0; }
63
+ size_t size() const { return size_; }
64
+
65
+ // Data access (for Python buffer protocol and SIMD)
66
+ T* data() { return data_; }
67
+ const T* data() const { return data_; }
68
+
69
+ /// Get as std::span for modern C++ APIs
70
+ std::span<T> span() { return {data_, size_}; }
71
+ std::span<const T> span() const { return {data_, size_}; }
72
+
73
+ /// Create a subview
74
+ ColumnView subview(size_t offset, size_t count) const {
75
+ if (offset + count > size_) {
76
+ throw std::out_of_range("ColumnView subview out of range");
77
+ }
78
+ return ColumnView(const_cast<T*>(data_) + offset, count);
79
+ }
80
+
81
+ private:
82
+ T* data_;
83
+ size_t size_;
84
+ };
85
+
86
+ // Common type aliases
87
+ using Int64View = ColumnView<int64_t>;
88
+ using Float64View = ColumnView<double>;
89
+ using TimestampView = ColumnView<int64_t>;
90
+ using SymbolView = ColumnView<uint32_t>;
91
+ using BoolView = ColumnView<uint8_t>;
92
+
93
+ } // namespace wayy_db
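
On the Python side this zero-copy data access surfaces as `to_numpy()` on a column, the same call api/streaming.py uses when flushing; a minimal sketch, assuming `wayy_db.from_dict` accepts plain NumPy arrays as it does elsewhere in this commit. Whether the returned array is a true view or a copy is not stated in this header.

import numpy as np
import wayy_db as wdb

# Build a small table the same way api/streaming.py does
table = wdb.from_dict(
    {
        "timestamp": np.arange(5, dtype=np.int64),
        "price": np.linspace(100.0, 104.0, 5),
    },
    name="demo",
    sorted_by="timestamp",
)

prices = table["price"].to_numpy()   # column data exposed to NumPy
print(prices.dtype, prices.mean())
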
include/wayy_db/database.hpp ADDED
@@ -0,0 +1,87 @@
1
+ #pragma once
2
+
3
+ #include "wayy_db/table.hpp"
4
+ #include "wayy_db/wal.hpp"
5
+
6
+ #include <memory>
7
+ #include <shared_mutex>
8
+ #include <string>
9
+ #include <unordered_map>
10
+ #include <vector>
11
+
12
+ namespace wayy_db {
13
+
14
+ /// High-level database interface managing multiple tables
15
+ class Database {
16
+ public:
17
+ /// Create an in-memory database
18
+ Database();
19
+
20
+ /// Create or open a persistent database at the given path
21
+ explicit Database(const std::string& path);
22
+
23
+ /// Move-only semantics
24
+ Database(Database&&) = default;
25
+ Database& operator=(Database&&) = default;
26
+ Database(const Database&) = delete;
27
+ Database& operator=(const Database&) = delete;
28
+
29
+ ~Database() = default;
30
+
31
+ /// Database path (empty for in-memory)
32
+ const std::string& path() const { return path_; }
33
+
34
+ /// Check if database is persistent
35
+ bool is_persistent() const { return !path_.empty(); }
36
+
37
+ /// List all table names
38
+ std::vector<std::string> tables() const;
39
+
40
+ /// Check if a table exists
41
+ bool has_table(const std::string& name) const;
42
+
43
+ /// Get a table by name (loads from disk if persistent and not cached)
44
+ Table& table(const std::string& name);
45
+ Table& operator[](const std::string& name) { return table(name); }
46
+
47
+ /// Create a new table
48
+ Table& create_table(const std::string& name);
49
+
50
+ /// Add an existing table to the database
51
+ void add_table(Table table);
52
+
53
+ /// Drop a table (removes from disk if persistent)
54
+ void drop_table(const std::string& name);
55
+
56
+ /// Save all modified tables to disk (no-op for in-memory)
57
+ void save();
58
+
59
+ /// Reload table list from disk
60
+ void refresh();
61
+
62
+ /// WAL: checkpoint (flush WAL, save tables, truncate WAL)
63
+ void checkpoint();
64
+
65
+ /// WAL: get access to WAL for logging (may be null for in-memory DB)
66
+ WriteAheadLog* wal() { return wal_.get(); }
67
+
68
+ private:
69
+ std::string path_;
70
+ std::unordered_map<std::string, Table> tables_;
71
+ std::unordered_map<std::string, bool> loaded_; // Track which tables are loaded
72
+
73
+ // Write-ahead log (persistent databases only)
74
+ std::unique_ptr<WriteAheadLog> wal_;
75
+
76
+ // Mutex for thread-safe access (mutable allows const methods to lock)
77
+ // Uses shared_mutex for concurrent reads, exclusive writes
78
+ mutable std::shared_mutex mutex_;
79
+
80
+ /// Get the directory path for a table
81
+ std::string table_path(const std::string& name) const;
82
+
83
+ /// Scan directory for existing tables
84
+ void scan_tables();
85
+ };
86
+
87
+ } // namespace wayy_db
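
A sketch of the table lifecycle this class manages, written against the Python bindings; `wdb.Database(path)` is an assumed constructor spelling, while `has_table`, `add_table`, `drop_table`, `save`, and indexing by name mirror the calls made in api/streaming.py.

import numpy as np
import wayy_db as wdb

db = wdb.Database("./data")            # assumed constructor for a persistent database
table = wdb.from_dict({"ts": np.arange(3, dtype=np.int64)}, name="events", sorted_by="ts")

db.add_table(table)
assert db.has_table("events")
print(db["events"].column_names())

db.save()                              # persist modified tables (no-op for in-memory)
db.drop_table("events")
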
include/wayy_db/hash_index.hpp ADDED
@@ -0,0 +1,46 @@
1
+ #pragma once
2
+
3
+ #include <cstdint>
4
+ #include <optional>
5
+ #include <string>
6
+ #include <string_view>
7
+ #include <unordered_map>
8
+
9
+ namespace wayy_db {
10
+
11
+ // Forward declarations
12
+ class Table;
13
+
14
+ /// Hash-based primary key index supporting both int64 and string keys.
15
+ class HashIndex {
16
+ public:
17
+ HashIndex() = default;
18
+
19
+ /// Build index from table column
20
+ void build_int(const Table& table, const std::string& col_name);
21
+ void build_str(const Table& table, const std::string& col_name);
22
+
23
+ /// Lookup
24
+ std::optional<size_t> find_int(int64_t key) const;
25
+ std::optional<size_t> find_str(std::string_view key) const;
26
+
27
+ /// Insert
28
+ void insert_int(int64_t key, size_t row);
29
+ void insert_str(std::string_view key, size_t row);
30
+
31
+ /// Remove
32
+ void remove_int(int64_t key);
33
+ void remove_str(std::string_view key);
34
+
35
+ /// Clear
36
+ void clear();
37
+
38
+ /// Size
39
+ size_t size() const { return int_map_.size() + str_map_.size(); }
40
+
41
+ private:
42
+ std::unordered_map<int64_t, size_t> int_map_;
43
+ std::unordered_map<std::string, size_t> str_map_;
44
+ };
45
+
46
+ } // namespace wayy_db
include/wayy_db/mmap_file.hpp ADDED
@@ -0,0 +1,67 @@
1
+ #pragma once
2
+
3
+ #include <cstddef>
4
+ #include <string>
5
+
6
+ namespace wayy_db {
7
+
8
+ /// Memory-mapped file abstraction
9
+ /// Provides platform-independent mmap operations for zero-copy I/O
10
+ class MmapFile {
11
+ public:
12
+ enum class Mode {
13
+ ReadOnly,
14
+ ReadWrite,
15
+ Create, // Create or truncate
16
+ };
17
+
18
+ /// Construct without opening
19
+ MmapFile() = default;
20
+
21
+ /// Open and map a file
22
+ explicit MmapFile(const std::string& path, Mode mode = Mode::ReadOnly,
23
+ size_t size = 0);
24
+
25
+ /// Move-only semantics
26
+ MmapFile(MmapFile&& other) noexcept;
27
+ MmapFile& operator=(MmapFile&& other) noexcept;
28
+ MmapFile(const MmapFile&) = delete;
29
+ MmapFile& operator=(const MmapFile&) = delete;
30
+
31
+ ~MmapFile();
32
+
33
+ /// Open a file for mapping
34
+ void open(const std::string& path, Mode mode = Mode::ReadOnly,
35
+ size_t size = 0);
36
+
37
+ /// Close and unmap the file
38
+ void close();
39
+
40
+ /// Check if file is open
41
+ bool is_open() const { return data_ != nullptr; }
42
+
43
+ /// Get mapped memory
44
+ void* data() { return data_; }
45
+ const void* data() const { return data_; }
46
+
47
+ /// Get mapped size
48
+ size_t size() const { return size_; }
49
+
50
+ /// Get file path
51
+ const std::string& path() const { return path_; }
52
+
53
+ /// Sync changes to disk (for ReadWrite/Create modes)
54
+ void sync();
55
+
56
+ /// Resize the mapping (only for Create mode, extends file)
57
+ void resize(size_t new_size);
58
+
59
+ private:
60
+ std::string path_;
61
+ void* data_ = nullptr;
62
+ size_t size_ = 0;
63
+ Mode mode_ = Mode::ReadOnly;
64
+ int fd_ = -1; // File descriptor (POSIX)
65
+ };
66
+
67
+ } // namespace wayy_db
include/wayy_db/ops/aggregations.hpp ADDED
@@ -0,0 +1,69 @@
1
+ #pragma once
2
+
3
+ #include "wayy_db/column_view.hpp"
4
+ #include "wayy_db/column.hpp"
5
+
6
+ #include <cmath>
7
+ #include <limits>
8
+
9
+ namespace wayy_db::ops {
10
+
11
+ /// Sum of all values in a column
12
+ template<typename T>
13
+ T sum(const ColumnView<T>& col);
14
+
15
+ /// SIMD-optimized sum for float64
16
+ double sum_simd(const ColumnView<double>& col);
17
+ int64_t sum_simd(const ColumnView<int64_t>& col);
18
+
19
+ /// Mean (average) of all values
20
+ template<typename T>
21
+ double avg(const ColumnView<T>& col) {
22
+ if (col.empty()) return std::numeric_limits<double>::quiet_NaN();
23
+ return static_cast<double>(sum(col)) / static_cast<double>(col.size());
24
+ }
25
+
26
+ /// Minimum value
27
+ template<typename T>
28
+ T min(const ColumnView<T>& col);
29
+
30
+ /// Maximum value
31
+ template<typename T>
32
+ T max(const ColumnView<T>& col);
33
+
34
+ /// Standard deviation (population)
35
+ template<typename T>
36
+ double std_dev(const ColumnView<T>& col);
37
+
38
+ /// Variance (population)
39
+ template<typename T>
40
+ double variance(const ColumnView<T>& col);
41
+
42
+ /// Count non-null values (for future nullable support)
43
+ template<typename T>
44
+ size_t count(const ColumnView<T>& col) {
45
+ return col.size();
46
+ }
47
+
48
+ /// First value
49
+ template<typename T>
50
+ T first(const ColumnView<T>& col) {
51
+ if (col.empty()) throw InvalidOperation("first() on empty column");
52
+ return col.front();
53
+ }
54
+
55
+ /// Last value
56
+ template<typename T>
57
+ T last(const ColumnView<T>& col) {
58
+ if (col.empty()) throw InvalidOperation("last() on empty column");
59
+ return col.back();
60
+ }
61
+
62
+ // Type-erased aggregations on Column objects
63
+ double sum(const Column& col);
64
+ double avg(const Column& col);
65
+ double min_val(const Column& col);
66
+ double max_val(const Column& col);
67
+ double std_dev(const Column& col);
68
+
69
+ } // namespace wayy_db::ops
include/wayy_db/ops/joins.hpp ADDED
@@ -0,0 +1,48 @@
1
+ #pragma once
2
+
3
+ #include "wayy_db/table.hpp"
4
+
5
+ #include <string>
6
+ #include <vector>
7
+
8
+ namespace wayy_db::ops {
9
+
10
+ /// As-of join: for each row in left, find the most recent row in right
11
+ /// where right.as_of <= left.as_of and join keys match
12
+ ///
13
+ /// Both tables must be sorted by the as_of column
14
+ ///
15
+ /// @param left Left table (e.g., trades)
16
+ /// @param right Right table (e.g., quotes)
17
+ /// @param on Join key columns (e.g., ["symbol"])
18
+ /// @param as_of Temporal column name (e.g., "timestamp")
19
+ /// @return Joined table with columns from both tables
20
+ Table aj(const Table& left, const Table& right,
21
+ const std::vector<std::string>& on,
22
+ const std::string& as_of);
23
+
24
+ /// Window join: for each row in left, find all rows in right
25
+ /// within the specified time window
26
+ ///
27
+ /// @param left Left table
28
+ /// @param right Right table
29
+ /// @param on Join key columns
30
+ /// @param as_of Temporal column name
31
+ /// @param window_before Nanoseconds before left.as_of to include
32
+ /// @param window_after Nanoseconds after left.as_of to include
33
+ /// @return Joined table (may have more rows than left due to multiple matches)
34
+ Table wj(const Table& left, const Table& right,
35
+ const std::vector<std::string>& on,
36
+ const std::string& as_of,
37
+ int64_t window_before,
38
+ int64_t window_after);
39
+
40
+ /// Inner join on specified columns
41
+ Table inner_join(const Table& left, const Table& right,
42
+ const std::vector<std::string>& on);
43
+
44
+ /// Left join on specified columns
45
+ Table left_join(const Table& left, const Table& right,
46
+ const std::vector<std::string>& on);
47
+
48
+ } // namespace wayy_db::ops
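
The aj contract above (most recent right row with right.as_of <= left.as_of, both inputs sorted by the as_of column) is easiest to see on a tiny example. A hedged sketch through the Python bindings; the tick data and symbol ids are invented.

import numpy as np
import wayy_db
from wayy_db import ops

trades = wayy_db.from_dict({
    "timestamp": np.array([100, 200, 300], dtype=np.int64),
    "symbol":    np.array([1, 1, 1], dtype=np.uint32),
    "price":     np.array([10.0, 10.5, 10.2]),
}, name="trades", sorted_by="timestamp")

quotes = wayy_db.from_dict({
    "timestamp": np.array([90, 250], dtype=np.int64),
    "symbol":    np.array([1, 1], dtype=np.uint32),
    "bid":       np.array([9.9, 10.4]),
}, name="quotes", sorted_by="timestamp")

# Trades at 100 and 200 pick the quote at 90; the trade at 300 picks the quote at 250.
joined = ops.aj(trades, quotes, on=["symbol"], as_of="timestamp")
print(joined.to_dict())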
include/wayy_db/ops/window.hpp ADDED
@@ -0,0 +1,54 @@
1
+ #pragma once
2
+
3
+ #include "wayy_db/column_view.hpp"
4
+
5
+ #include <vector>
6
+
7
+ namespace wayy_db::ops {
8
+
9
+ /// Moving average over a sliding window
10
+ /// @param col Input column
11
+ /// @param window Window size
12
+ /// @return Vector of moving averages (first window-1 values are partial averages)
13
+ std::vector<double> mavg(const ColumnView<double>& col, size_t window);
14
+ std::vector<double> mavg(const ColumnView<int64_t>& col, size_t window);
15
+
16
+ /// Moving sum over a sliding window
17
+ std::vector<double> msum(const ColumnView<double>& col, size_t window);
18
+ std::vector<int64_t> msum(const ColumnView<int64_t>& col, size_t window);
19
+
20
+ /// Moving standard deviation over a sliding window
21
+ std::vector<double> mstd(const ColumnView<double>& col, size_t window);
22
+ std::vector<double> mstd(const ColumnView<int64_t>& col, size_t window);
23
+
24
+ /// Moving minimum over a sliding window (O(n) using monotonic deque)
25
+ std::vector<double> mmin(const ColumnView<double>& col, size_t window);
26
+ std::vector<int64_t> mmin(const ColumnView<int64_t>& col, size_t window);
27
+
28
+ /// Moving maximum over a sliding window (O(n) using monotonic deque)
29
+ std::vector<double> mmax(const ColumnView<double>& col, size_t window);
30
+ std::vector<int64_t> mmax(const ColumnView<int64_t>& col, size_t window);
31
+
32
+ /// Exponential moving average
33
+ /// @param col Input column
34
+ /// @param alpha Smoothing factor (0 < alpha <= 1)
35
+ /// @return Vector of EMA values
36
+ std::vector<double> ema(const ColumnView<double>& col, double alpha);
37
+ std::vector<double> ema(const ColumnView<int64_t>& col, double alpha);
38
+
39
+ /// Exponential moving average with span
40
+ /// alpha = 2 / (span + 1)
41
+ std::vector<double> ema_span(const ColumnView<double>& col, size_t span);
42
+
43
+ /// Diff: difference between consecutive values
44
+ std::vector<double> diff(const ColumnView<double>& col, size_t periods = 1);
45
+ std::vector<int64_t> diff(const ColumnView<int64_t>& col, size_t periods = 1);
46
+
47
+ /// Percent change between consecutive values
48
+ std::vector<double> pct_change(const ColumnView<double>& col, size_t periods = 1);
49
+
50
+ /// Shift values by n positions (positive = forward, negative = backward)
51
+ std::vector<double> shift(const ColumnView<double>& col, int64_t n);
52
+ std::vector<int64_t> shift(const ColumnView<int64_t>& col, int64_t n);
53
+
54
+ } // namespace wayy_db::ops
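
These window functions are re-exported through the ops module, and ema_span simply maps a span to alpha = 2 / (span + 1). A short usage sketch via the bindings; the series is invented, and note the first window-1 outputs are the partial averages the header describes.

import numpy as np
import wayy_db
from wayy_db import ops

t = wayy_db.from_dict({"px": np.array([1.0, 2.0, 3.0, 4.0, 5.0])})

print(ops.mavg(t["px"], window=3))            # partial averages until the window fills
print(ops.ema(t["px"], alpha=2 / (10 + 1)))   # span-10 EMA expressed through alpha
print(ops.diff(t["px"]))                      # consecutive differences, periods defaults to 1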
include/wayy_db/string_column.hpp ADDED
@@ -0,0 +1,79 @@
1
+ #pragma once
2
+
3
+ #include "wayy_db/types.hpp"
4
+
5
+ #include <cstdint>
6
+ #include <optional>
7
+ #include <string>
8
+ #include <string_view>
9
+ #include <vector>
10
+
11
+ namespace wayy_db {
12
+
13
+ /// Arrow-style variable-length string column.
14
+ /// Storage layout:
15
+ /// offsets_: int64_t[N+1] — byte offsets into data_
16
+ /// data_: uint8_t[] — concatenated UTF-8 bytes
17
+ /// validity_: uint8_t[] — 1 bit per row (bit=1 valid, bit=0 null)
18
+ ///
19
+ /// String at row i = data_[offsets_[i] .. offsets_[i+1]]
20
+ class StringColumn {
21
+ public:
22
+ /// Construct an empty string column
23
+ explicit StringColumn(std::string name = "");
24
+
25
+ /// Move-only semantics
26
+ StringColumn(StringColumn&&) = default;
27
+ StringColumn& operator=(StringColumn&&) = default;
28
+ StringColumn(const StringColumn&) = delete;
29
+ StringColumn& operator=(const StringColumn&) = delete;
30
+
31
+ /// Column metadata
32
+ const std::string& name() const { return name_; }
33
+ DType dtype() const { return DType::String; }
34
+ size_t size() const { return offsets_.empty() ? 0 : offsets_.size() - 1; }
35
+ size_t data_bytes() const { return data_.size(); }
36
+
37
+ /// Read a string at the given row
38
+ std::string_view get(size_t row) const;
39
+
40
+ /// Append a new string
41
+ void append(std::string_view val);
42
+
43
+ /// Append a null value
44
+ void append_null();
45
+
46
+ /// Overwrite the string at a given row.
47
+ /// If the new string fits in the existing slot, it's written in-place.
48
+ /// Otherwise, the old slot is wasted and the new value is appended to data_.
49
+ void set(size_t row, std::string_view val);
50
+
51
+ /// Validity bitmap
52
+ bool has_validity() const { return has_validity_; }
53
+ bool is_valid(size_t row) const;
54
+ void set_valid(size_t row, bool valid);
55
+ size_t count_valid() const;
56
+
57
+ /// Persistence
58
+ void save(const std::string& dir_path, const std::string& col_name) const;
59
+ static StringColumn load(const std::string& dir_path, const std::string& col_name);
60
+
61
+ /// Direct access for bulk operations
62
+ const std::vector<int64_t>& offsets() const { return offsets_; }
63
+ const std::vector<uint8_t>& data_buf() const { return data_; }
64
+ const std::vector<uint8_t>& validity_bitmap() const { return validity_; }
65
+
66
+ /// Collect all strings as a vector (copy)
67
+ std::vector<std::string> to_vector() const;
68
+
69
+ private:
70
+ std::string name_;
71
+ std::vector<int64_t> offsets_; // N+1 offsets
72
+ std::vector<uint8_t> data_; // Concatenated UTF-8 bytes
73
+ std::vector<uint8_t> validity_; // Null bitmap
74
+ bool has_validity_ = false;
75
+
76
+ void ensure_validity();
77
+ };
78
+
79
+ } // namespace wayy_db
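
The offsets/data split documented at the top of this class is the standard Arrow layout: N+1 offsets index into one concatenated byte buffer. A plain-Python illustration of how three rows land in the two buffers (not the StringColumn class itself); the sample strings are invented.

strings = ["AAPL", "", "MSFT"]

data = b"".join(s.encode("utf-8") for s in strings)   # b"AAPLMSFT"
offsets = [0]
for s in strings:
    offsets.append(offsets[-1] + len(s.encode("utf-8")))

# offsets == [0, 4, 4, 8]; string i spans data[offsets[i]:offsets[i+1]],
# so row 1 is the empty string and row 2 is data[4:8] == b"MSFT".
print(offsets, data)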
include/wayy_db/table.hpp ADDED
@@ -0,0 +1,133 @@
1
+ #pragma once
2
+
3
+ #include "wayy_db/types.hpp"
4
+ #include "wayy_db/column.hpp"
5
+ #include "wayy_db/string_column.hpp"
6
+ #include "wayy_db/mmap_file.hpp"
7
+
8
+ #include <any>
9
+ #include <memory>
10
+ #include <mutex>
11
+ #include <optional>
12
+ #include <shared_mutex>
13
+ #include <string>
14
+ #include <unordered_map>
15
+ #include <vector>
16
+
17
+ namespace wayy_db {
18
+
19
+ // Forward declarations
20
+ class HashIndex;
21
+
22
+ /// Columnar table with optional sorted index, OLTP capabilities,
23
+ /// and per-table reader-writer locking.
24
+ class Table {
25
+ public:
26
+ /// Construct an empty table
27
+ explicit Table(std::string name = "");
28
+
29
+ /// Move-only semantics (shared_mutex is non-movable, so custom move ctor)
30
+ Table(Table&& other) noexcept;
31
+ Table& operator=(Table&& other) noexcept;
32
+ Table(const Table&) = delete;
33
+ Table& operator=(const Table&) = delete;
34
+ ~Table();
35
+
36
+ /// Table metadata
37
+ const std::string& name() const { return name_; }
38
+ size_t num_rows() const { return num_rows_; }
39
+ size_t num_columns() const { return columns_.size() + string_columns_.size(); }
40
+
41
+ /// Per-table reader-writer lock
42
+ auto read_lock() const { return std::shared_lock(mu_); }
43
+ auto write_lock() { return std::unique_lock(mu_); }
44
+
45
+ /// Column management (fixed-width columns)
46
+ void add_column(Column column);
47
+ void add_column(const std::string& name, DType dtype, void* data, size_t size);
48
+
49
+ /// String column management
50
+ void add_string_column(StringColumn col);
51
+ bool has_string_column(const std::string& name) const;
52
+ StringColumn& string_column(const std::string& name);
53
+ const StringColumn& string_column(const std::string& name) const;
54
+
55
+ bool has_column(const std::string& name) const;
56
+ Column& column(const std::string& name);
57
+ const Column& column(const std::string& name) const;
58
+ Column& operator[](const std::string& name) { return column(name); }
59
+ const Column& operator[](const std::string& name) const { return column(name); }
60
+
61
+ /// Get the DType of any column (fixed or string)
62
+ DType column_dtype(const std::string& name) const;
63
+
64
+ std::vector<std::string> column_names() const;
65
+
66
+ /// Sorted index (critical for temporal joins)
67
+ void set_sorted_by(const std::string& col);
68
+ std::optional<std::string> sorted_by() const { return sorted_by_; }
69
+ bool is_sorted() const { return sorted_by_.has_value(); }
70
+
71
+ /// Primary key + hash index
72
+ void set_primary_key(const std::string& col_name);
73
+ const std::optional<std::string>& primary_key() const { return primary_key_; }
74
+ std::optional<size_t> find_row(int64_t key) const;
75
+ std::optional<size_t> find_row(std::string_view key) const;
76
+ void rebuild_index();
77
+
78
+ /// CRUD operations
79
+ size_t append_row(const std::unordered_map<std::string, std::any>& values);
80
+ bool update_row(int64_t pk, const std::unordered_map<std::string, std::any>& values);
81
+ bool update_row(std::string_view pk, const std::unordered_map<std::string, std::any>& values);
82
+ bool delete_row(int64_t pk);
83
+ bool delete_row(std::string_view pk);
84
+
85
+ /// Filter: returns vector of row indices matching predicate
86
+ std::vector<size_t> where_eq(const std::string& col, int64_t val) const;
87
+ std::vector<size_t> where_eq(const std::string& col, std::string_view val) const;
88
+
89
+ /// Compaction: physically remove deleted rows, rebuild index
90
+ void compact();
91
+
92
+ /// Persistence
93
+ void save(const std::string& dir_path) const;
94
+ static Table load(const std::string& dir_path);
95
+
96
+ /// Create from memory-mapped directory (zero-copy)
97
+ static Table mmap(const std::string& dir_path);
98
+
99
+ private:
100
+ std::string name_;
101
+ size_t num_rows_ = 0;
102
+ std::vector<Column> columns_;
103
+ std::unordered_map<std::string, size_t> column_index_;
104
+ std::optional<std::string> sorted_by_;
105
+
106
+ // String columns (separate storage)
107
+ std::vector<StringColumn> string_columns_;
108
+ std::unordered_map<std::string, size_t> string_column_index_;
109
+
110
+ // Primary key + hash index
111
+ std::optional<std::string> primary_key_;
112
+ std::unique_ptr<HashIndex> pk_index_;
113
+
114
+ // Per-table reader-writer lock
115
+ mutable std::shared_mutex mu_;
116
+
117
+ // For mmap'd tables, keep file handles alive
118
+ std::vector<MmapFile> mmap_files_;
119
+
120
+ /// Write metadata JSON
121
+ void write_metadata(const std::string& dir_path) const;
122
+
123
+ /// Read metadata JSON and return column info
124
+ static std::tuple<std::string, size_t, std::optional<std::string>,
125
+ std::optional<std::string>,
126
+ std::vector<std::pair<std::string, DType>>>
127
+ read_metadata(const std::string& dir_path);
128
+
129
+ /// Internal row update by row index (no PK lookup)
130
+ bool update_row_at(size_t row_idx, const std::unordered_map<std::string, std::any>& values);
131
+ };
132
+
133
+ } // namespace wayy_db
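
The primary-key/hash-index and CRUD surface above is exposed almost one-to-one by the Python bindings. A hedged sketch of that flow; the schema and values are invented, and note delete_row is a soft delete until compact() physically removes rows.

import numpy as np
from wayy_db import Table, DType

orders = Table("orders")
orders.add_column_from_numpy("order_id", np.array([1, 2], dtype=np.int64), DType.Int64)
orders.add_column_from_numpy("qty", np.array([100.0, 250.0]), DType.Float64)
orders.set_primary_key("order_id")            # builds the hash index

orders.append_row({"order_id": 3, "qty": 75.0})
print(orders.find_row(3))                     # row index, or None if missing
orders.update_row(3, {"qty": 80.0})
orders.delete_row(2)                          # soft delete
orders.compact()                              # drop deleted rows, rebuild the index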
include/wayy_db/types.hpp ADDED
@@ -0,0 +1,100 @@
1
+ #pragma once
2
+
3
+ #include <cstdint>
4
+ #include <string>
5
+ #include <string_view>
6
+ #include <stdexcept>
7
+
8
+ namespace wayy_db {
9
+
10
+ /// Supported data types for columns
11
+ enum class DType : uint8_t {
12
+ Int64 = 0,
13
+ Float64 = 1,
14
+ Timestamp = 2, // Nanoseconds since Unix epoch
15
+ Symbol = 3, // Interned string index
16
+ Bool = 4,
17
+ String = 5, // Arrow-style variable-length UTF-8 (offsets + data)
18
+ Decimal6 = 6, // Int64 with implied 6 decimal places (max ±9.2T)
19
+ };
20
+
21
+ /// Get the size in bytes for a given type (0 for variable-length types)
22
+ constexpr size_t dtype_size(DType dtype) {
23
+ switch (dtype) {
24
+ case DType::Int64: return sizeof(int64_t);
25
+ case DType::Float64: return sizeof(double);
26
+ case DType::Timestamp: return sizeof(int64_t);
27
+ case DType::Symbol: return sizeof(uint32_t);
28
+ case DType::Bool: return sizeof(uint8_t);
29
+ case DType::String: return 0; // Variable-length, use StringColumn
30
+ case DType::Decimal6: return sizeof(int64_t); // Stored as int64
31
+ }
32
+ return 0; // Unreachable
33
+ }
34
+
35
+ /// Check if a dtype is fixed-width
36
+ constexpr bool dtype_is_fixed(DType dtype) {
37
+ return dtype != DType::String;
38
+ }
39
+
40
+ /// Convert DType to string representation
41
+ constexpr std::string_view dtype_to_string(DType dtype) {
42
+ switch (dtype) {
43
+ case DType::Int64: return "int64";
44
+ case DType::Float64: return "float64";
45
+ case DType::Timestamp: return "timestamp";
46
+ case DType::Symbol: return "symbol";
47
+ case DType::Bool: return "bool";
48
+ case DType::String: return "string";
49
+ case DType::Decimal6: return "decimal6";
50
+ }
51
+ return "unknown";
52
+ }
53
+
54
+ /// Parse DType from string
55
+ DType dtype_from_string(std::string_view s);
56
+
57
+ /// Magic number for WayyDB files: "WAYYDB\x00\x01"
58
+ constexpr uint64_t WAYY_MAGIC = 0x57415959'44420001ULL;
59
+
60
+ /// Current file format version
61
+ constexpr uint32_t WAYY_VERSION = 1;
62
+
63
+ /// Column file header (64 bytes)
64
+ struct ColumnHeader {
65
+ uint64_t magic; // WAYY_MAGIC
66
+ uint32_t version; // WAYY_VERSION
67
+ DType dtype; // Data type
68
+ uint8_t reserved1[3]; // Padding
69
+ uint64_t row_count; // Number of rows
70
+ uint64_t compression; // 0 = none, 1 = LZ4
71
+ uint8_t reserved2[24]; // Reserved for future use
72
+ uint64_t data_offset; // Offset to data (typically 64)
73
+ };
74
+
75
+ static_assert(sizeof(ColumnHeader) == 64, "ColumnHeader must be 64 bytes");
76
+
77
+ /// Exception types
78
+ class WayyException : public std::runtime_error {
79
+ using std::runtime_error::runtime_error;
80
+ };
81
+
82
+ class ColumnNotFound : public WayyException {
83
+ public:
84
+ explicit ColumnNotFound(const std::string& name)
85
+ : WayyException("Column not found: " + name) {}
86
+ };
87
+
88
+ class TypeMismatch : public WayyException {
89
+ public:
90
+ TypeMismatch(DType expected, DType actual)
91
+ : WayyException("Type mismatch: expected " +
92
+ std::string(dtype_to_string(expected)) +
93
+ ", got " + std::string(dtype_to_string(actual))) {}
94
+ };
95
+
96
+ class InvalidOperation : public WayyException {
97
+ using WayyException::WayyException;
98
+ };
99
+
100
+ } // namespace wayy_db
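
ColumnHeader fixes a 64-byte on-disk header whose field order and widths are spelled out above. A hedged sketch that packs one such header from Python; little-endian packing and zeroed reserved bytes are assumptions about the on-disk format, not facts taken from the save code.

import struct

# magic, version, dtype, reserved1[3], row_count, compression, reserved2[24], data_offset
HEADER_FMT = "<QIB3sQQ24sQ"
assert struct.calcsize(HEADER_FMT) == 64

WAYY_MAGIC = 0x5741595944420001
header = struct.pack(HEADER_FMT, WAYY_MAGIC, 1, 1, b"\x00" * 3,
                     1_000_000, 0, b"\x00" * 24, 64)   # a Float64 column with 1M rows

magic, version, dtype, _, rows, compression, _, data_offset = struct.unpack(HEADER_FMT, header)
print(hex(magic), version, dtype, rows, data_offset)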
include/wayy_db/wal.hpp ADDED
@@ -0,0 +1,78 @@
1
+ #pragma once
2
+
3
+ #include <cstdint>
4
+ #include <fstream>
5
+ #include <mutex>
6
+ #include <string>
7
+ #include <vector>
8
+
9
+ namespace wayy_db {
10
+
11
+ // Forward declaration
12
+ class Database;
13
+
14
+ /// WAL operation types
15
+ enum class WalOp : uint8_t {
16
+ Insert = 1,
17
+ Update = 2,
18
+ Delete = 3,
19
+ };
20
+
21
+ /// WAL magic number
22
+ constexpr uint32_t WAL_MAGIC = 0x57414C01; // "WAL\x01"
23
+
24
+ /// Binary WAL entry format:
25
+ /// [4B magic][1B op_type][4B table_name_len][table_name]
26
+ /// [8B row_id][4B payload_len][payload][4B CRC32]
27
+ ///
28
+ /// For Insert: payload = serialized row (col_name:type:data pairs)
29
+ /// For Update: payload = serialized partial row (only changed columns)
30
+ /// For Delete: payload = empty
31
+
32
+ class WriteAheadLog {
33
+ public:
34
+ /// Create or open a WAL at the given directory
35
+ explicit WriteAheadLog(const std::string& db_path);
36
+
37
+ ~WriteAheadLog();
38
+
39
+ /// Log an insert operation
40
+ void log_insert(const std::string& table, size_t row,
41
+ const std::vector<uint8_t>& data);
42
+
43
+ /// Log an update operation
44
+ void log_update(const std::string& table, size_t row,
45
+ const std::string& col, const std::vector<uint8_t>& data);
46
+
47
+ /// Log a delete operation
48
+ void log_delete(const std::string& table, size_t row);
49
+
50
+ /// Checkpoint: flush WAL, save all tables, truncate WAL
51
+ void checkpoint(Database& db);
52
+
53
+ /// Replay WAL entries to recover state after crash
54
+ void replay(Database& db);
55
+
56
+ /// Check if WAL has unprocessed entries
57
+ bool has_entries() const;
58
+
59
+ /// Get WAL file path
60
+ const std::string& path() const { return path_; }
61
+
62
+ private:
63
+ std::string path_;
64
+ std::ofstream file_;
65
+ mutable std::mutex mu_;
66
+
67
+ /// Write a raw entry to the WAL file
68
+ void write_entry(WalOp op, const std::string& table, size_t row,
69
+ const std::vector<uint8_t>& payload);
70
+
71
+ /// Compute CRC32 over buffer
72
+ static uint32_t crc32(const uint8_t* data, size_t len);
73
+
74
+ /// Open WAL file for appending
75
+ void open_for_append();
76
+ };
77
+
78
+ } // namespace wayy_db
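
The comment block above fixes the binary entry layout. A hedged sketch that packs a single Insert entry following that layout; the byte order and exactly which bytes the trailing CRC32 covers are assumptions rather than facts taken from src/wal.cpp.

import struct
import zlib

WAL_MAGIC = 0x57414C01
OP_INSERT = 1

def pack_insert(table: str, row_id: int, payload: bytes) -> bytes:
    name = table.encode("utf-8")
    body = (struct.pack("<IBI", WAL_MAGIC, OP_INSERT, len(name)) + name
            + struct.pack("<QI", row_id, len(payload)) + payload)
    # Assumption: CRC32 is computed over everything that precedes it.
    return body + struct.pack("<I", zlib.crc32(body))

entry = pack_insert("trades", 42, b"px:float64:...")
print(len(entry), entry[:8])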
include/wayy_db/wayy_db.hpp ADDED
@@ -0,0 +1,16 @@
1
+ #pragma once
2
+
3
+ /// Main header that includes all WayyDB components
4
+
5
+ #include "wayy_db/types.hpp"
6
+ #include "wayy_db/column_view.hpp"
7
+ #include "wayy_db/column.hpp"
8
+ #include "wayy_db/string_column.hpp"
9
+ #include "wayy_db/hash_index.hpp"
10
+ #include "wayy_db/table.hpp"
11
+ #include "wayy_db/wal.hpp"
12
+ #include "wayy_db/database.hpp"
13
+ #include "wayy_db/mmap_file.hpp"
14
+ #include "wayy_db/ops/aggregations.hpp"
15
+ #include "wayy_db/ops/joins.hpp"
16
+ #include "wayy_db/ops/window.hpp"
pyproject.toml ADDED
@@ -0,0 +1,127 @@
1
+ [build-system]
2
+ requires = ["scikit-build-core>=0.5", "pybind11>=2.13"]
3
+ build-backend = "scikit_build_core.build"
4
+
5
+ [project]
6
+ name = "wayy-db"
7
+ version = "0.1.0"
8
+ description = "High-performance columnar time-series database with kdb+-like functionality"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "Wayy Research", email = "dev@wayy.io"}
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Developers",
18
+ "Intended Audience :: Financial and Insurance Industry",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.9",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ "Programming Language :: C++",
26
+ "Topic :: Database",
27
+ "Topic :: Scientific/Engineering :: Information Analysis",
28
+ ]
29
+ keywords = [
30
+ "database",
31
+ "time-series",
32
+ "columnar",
33
+ "kdb",
34
+ "as-of-join",
35
+ "quantitative-finance",
36
+ "trading",
37
+ "numpy",
38
+ "high-performance",
39
+ ]
40
+ dependencies = [
41
+ "numpy>=1.20",
42
+ ]
43
+
44
+ [project.optional-dependencies]
45
+ cli = [
46
+ "typer>=0.9",
47
+ "httpx>=0.25",
48
+ "websockets>=12.0",
49
+ "rich>=13.0",
50
+ ]
51
+ api = [
52
+ "fastapi>=0.109.0",
53
+ "uvicorn[standard]>=0.27.0",
54
+ "pydantic>=2.0",
55
+ "websockets>=12.0",
56
+ "redis[hiredis]>=5.0",
57
+ ]
58
+ dev = [
59
+ "pytest>=7.0",
60
+ "pytest-cov",
61
+ "pytest-asyncio>=0.21",
62
+ "httpx>=0.25",
63
+ "pandas>=2.0",
64
+ "polars>=0.20",
65
+ "hypothesis>=6.0",
66
+ "mypy>=1.0",
67
+ "ruff>=0.1",
68
+ ]
69
+ bench = [
70
+ "pandas>=2.0",
71
+ "polars>=0.20",
72
+ "duckdb>=0.9",
73
+ "psutil>=5.0",
74
+ "pytest-benchmark",
75
+ "memory-profiler",
76
+ ]
77
+ docs = [
78
+ "sphinx>=7.0",
79
+ "sphinx-rtd-theme",
80
+ "myst-parser",
81
+ ]
82
+
83
+ [project.scripts]
84
+ wayy = "wayy_db.cli.main:app"
85
+ wayy-db-bench = "benchmarks.benchmark:main"
86
+
87
+ [project.urls]
88
+ Homepage = "https://github.com/wayy-research/wayydb"
89
+ Documentation = "https://wayydb.readthedocs.io"
90
+
91
+ [tool.scikit-build]
92
+ cmake.args = ["-DWAYY_BUILD_PYTHON=ON", "-DWAYY_BUILD_TESTS=OFF"]
93
+ wheel.packages = ["python/wayy_db"]
94
+
95
+ [tool.cibuildwheel]
96
+ build-verbosity = 1
97
+ # Build for Python 3.9-3.13, including free-threaded 3.13
98
+ build = "cp39-* cp310-* cp311-* cp312-* cp313-* cp313t-*"
99
+ skip = "*-musllinux_* *-win32 *-manylinux_i686"
100
+
101
+ # Free-threaded Python 3.13 (no-GIL) configuration
102
+ [tool.cibuildwheel.free-threaded]
103
+ # Enable free-threaded builds on all platforms
104
+ build = "cp313t-*"
105
+
106
+ [[tool.cibuildwheel.overrides]]
107
+ # For free-threaded builds, ensure we're using the right Python
108
+ select = "cp313t-*"
109
+ inherit.environment = "append"
110
+
111
+ [tool.pytest.ini_options]
112
+ testpaths = ["tests/python"]
113
+ python_files = ["test_*.py"]
114
+ addopts = "-v --tb=short"
115
+ asyncio_mode = "strict"
116
+
117
+ [tool.ruff]
118
+ target-version = "py39"
119
+ line-length = 100
120
+
121
+ [tool.ruff.lint]
122
+ select = ["E", "F", "W", "I", "UP", "B", "C4", "SIM"]
123
+
124
+ [tool.mypy]
125
+ python_version = "3.9"
126
+ warn_return_any = true
127
+ warn_unused_configs = true
python/bindings.cpp ADDED
@@ -0,0 +1,377 @@
1
+ #include <pybind11/pybind11.h>
2
+ #include <pybind11/numpy.h>
3
+ #include <pybind11/stl.h>
4
+
5
+ #include "wayy_db/wayy_db.hpp"
6
+
7
+ #include <any>
8
+
9
+ namespace py = pybind11;
10
+
11
+ // GIL release guard for concurrent read operations
12
+ using release_gil = py::call_guard<py::gil_scoped_release>;
13
+
14
+ using namespace wayy_db;
15
+
16
+ // Namespace alias to avoid collision with local variable
17
+ namespace wdb_ops = wayy_db::ops;
18
+
19
+ // Helper to convert numpy dtype to WayyDB DType
20
+ DType numpy_dtype_to_wayy(py::dtype dt) {
21
+ if (dt.is(py::dtype::of<int64_t>())) return DType::Int64;
22
+ if (dt.is(py::dtype::of<double>())) return DType::Float64;
23
+ if (dt.is(py::dtype::of<uint32_t>())) return DType::Symbol;
24
+ if (dt.is(py::dtype::of<uint8_t>())) return DType::Bool;
25
+ throw std::runtime_error("Unsupported numpy dtype");
26
+ }
27
+
28
+ // Helper to get numpy dtype from WayyDB DType
29
+ py::dtype wayy_dtype_to_numpy(DType dt) {
30
+ switch (dt) {
31
+ case DType::Int64:
32
+ case DType::Timestamp:
33
+ case DType::Decimal6:
34
+ return py::dtype::of<int64_t>();
35
+ case DType::Float64:
36
+ return py::dtype::of<double>();
37
+ case DType::Symbol:
38
+ return py::dtype::of<uint32_t>();
39
+ case DType::Bool:
40
+ return py::dtype::of<uint8_t>();
41
+ case DType::String:
42
+ throw std::runtime_error("String columns use StringColumn, not numpy");
43
+ }
44
+ throw std::runtime_error("Unknown dtype");
45
+ }
46
+
47
+ // Helper: convert Python dict to std::unordered_map<string, std::any>
48
+ std::unordered_map<std::string, std::any> py_dict_to_any_map(
49
+ py::dict d, Table& table) {
50
+ std::unordered_map<std::string, std::any> result;
51
+ for (auto& [key, val] : d) {
52
+ std::string col_name = py::str(key);
53
+ DType dt = table.column_dtype(col_name);
54
+
55
+ if (dt == DType::String) {
56
+ result[col_name] = std::string(py::str(val));
57
+ } else if (dt == DType::Int64 || dt == DType::Timestamp || dt == DType::Decimal6) {
58
+ result[col_name] = py::cast<int64_t>(val);
59
+ } else if (dt == DType::Float64) {
60
+ result[col_name] = py::cast<double>(val);
61
+ } else if (dt == DType::Symbol) {
62
+ result[col_name] = py::cast<uint32_t>(val);
63
+ } else if (dt == DType::Bool) {
64
+ result[col_name] = py::cast<uint8_t>(val);
65
+ }
66
+ }
67
+ return result;
68
+ }
69
+
70
+ PYBIND11_MODULE(_core, m, py::mod_gil_not_used()) {
71
+ m.doc() = "WayyDB: High-performance columnar time-series database (free-threading safe)";
72
+
73
+ // DType enum
74
+ py::enum_<DType>(m, "DType")
75
+ .value("Int64", DType::Int64)
76
+ .value("Float64", DType::Float64)
77
+ .value("Timestamp", DType::Timestamp)
78
+ .value("Symbol", DType::Symbol)
79
+ .value("Bool", DType::Bool)
80
+ .value("String", DType::String)
81
+ .value("Decimal6", DType::Decimal6)
82
+ .export_values();
83
+
84
+ // Exceptions
85
+ py::register_exception<WayyException>(m, "WayyException");
86
+ py::register_exception<ColumnNotFound>(m, "ColumnNotFound");
87
+ py::register_exception<TypeMismatch>(m, "TypeMismatch");
88
+ py::register_exception<InvalidOperation>(m, "InvalidOperation");
89
+
90
+ // Column class
91
+ py::class_<Column>(m, "Column")
92
+ .def_property_readonly("name", &Column::name)
93
+ .def_property_readonly("dtype", &Column::dtype)
94
+ .def_property_readonly("size", &Column::size)
95
+ .def("__len__", &Column::size)
96
+ .def("to_numpy", [](Column& self) -> py::array {
97
+ py::dtype dt = wayy_dtype_to_numpy(self.dtype());
98
+ return py::array(dt, {self.size()}, {dtype_size(self.dtype())},
99
+ self.data(), py::cast(self));
100
+ }, py::return_value_policy::reference_internal,
101
+ "Zero-copy view as numpy array")
102
+ .def("is_valid", &Column::is_valid, py::arg("row"),
103
+ "Check if row is valid (not null/deleted)")
104
+ .def("count_valid", &Column::count_valid,
105
+ "Count non-null/non-deleted rows");
106
+
107
+ // StringColumn class
108
+ py::class_<StringColumn>(m, "StringColumn")
109
+ .def(py::init<std::string>(), py::arg("name") = "")
110
+ .def_property_readonly("name", &StringColumn::name)
111
+ .def_property_readonly("dtype", &StringColumn::dtype)
112
+ .def_property_readonly("size", &StringColumn::size)
113
+ .def("__len__", &StringColumn::size)
114
+ .def("get", &StringColumn::get, py::arg("row"),
115
+ "Get string at row index")
116
+ .def("append", &StringColumn::append, py::arg("val"),
117
+ "Append a string value")
118
+ .def("set", &StringColumn::set, py::arg("row"), py::arg("val"),
119
+ "Set string at row index")
120
+ .def("is_valid", &StringColumn::is_valid, py::arg("row"))
121
+ .def("count_valid", &StringColumn::count_valid)
122
+ .def("to_list", &StringColumn::to_vector,
123
+ "Get all strings as a Python list");
124
+
125
+ // Table class
126
+ py::class_<Table>(m, "Table")
127
+ .def(py::init<std::string>(), py::arg("name") = "")
128
+ .def_property_readonly("name", &Table::name)
129
+ .def_property_readonly("num_rows", &Table::num_rows)
130
+ .def_property_readonly("num_columns", &Table::num_columns)
131
+ .def_property_readonly("sorted_by", [](const Table& t) -> py::object {
132
+ if (t.sorted_by()) return py::cast(*t.sorted_by());
133
+ return py::none();
134
+ })
135
+ .def_property_readonly("primary_key", [](const Table& t) -> py::object {
136
+ if (t.primary_key()) return py::cast(*t.primary_key());
137
+ return py::none();
138
+ })
139
+ .def("__len__", &Table::num_rows)
140
+ .def("has_column", &Table::has_column)
141
+ .def("column", py::overload_cast<const std::string&>(&Table::column),
142
+ py::return_value_policy::reference_internal)
143
+ .def("__getitem__", py::overload_cast<const std::string&>(&Table::column),
144
+ py::return_value_policy::reference_internal)
145
+ .def("has_string_column", &Table::has_string_column)
146
+ .def("string_column", py::overload_cast<const std::string&>(&Table::string_column),
147
+ py::return_value_policy::reference_internal)
148
+ .def("column_dtype", &Table::column_dtype, py::arg("name"),
149
+ "Get the DType of any column (fixed or string)")
150
+ .def("column_names", &Table::column_names)
151
+ .def("set_sorted_by", &Table::set_sorted_by)
152
+ .def("set_primary_key", &Table::set_primary_key, py::arg("col_name"),
153
+ "Set the primary key column and build hash index")
154
+ .def("rebuild_index", &Table::rebuild_index,
155
+ "Rebuild the primary key hash index")
156
+ // CRUD operations
157
+ .def("append_row", [](Table& self, py::dict values) -> size_t {
158
+ auto map = py_dict_to_any_map(values, self);
159
+ return self.append_row(map);
160
+ }, py::arg("values"), "Append a row from a dict, returns row index")
161
+ .def("update_row", [](Table& self, py::object pk, py::dict values) -> bool {
162
+ auto map = py_dict_to_any_map(values, self);
163
+ if (py::isinstance<py::int_>(pk)) {
164
+ return self.update_row(py::cast<int64_t>(pk), map);
165
+ } else {
166
+ return self.update_row(std::string(py::str(pk)), map);
167
+ }
168
+ }, py::arg("pk"), py::arg("values"), "Update row by primary key")
169
+ .def("delete_row", [](Table& self, py::object pk) -> bool {
170
+ if (py::isinstance<py::int_>(pk)) {
171
+ return self.delete_row(py::cast<int64_t>(pk));
172
+ } else {
173
+ return self.delete_row(std::string(py::str(pk)));
174
+ }
175
+ }, py::arg("pk"), "Soft-delete row by primary key")
176
+ .def("find_row", [](const Table& self, py::object pk) -> py::object {
177
+ std::optional<size_t> row;
178
+ if (py::isinstance<py::int_>(pk)) {
179
+ row = self.find_row(py::cast<int64_t>(pk));
180
+ } else {
181
+ row = self.find_row(std::string(py::str(pk)));
182
+ }
183
+ if (row) return py::cast(*row);
184
+ return py::none();
185
+ }, py::arg("pk"), "Find row index by primary key")
186
+ .def("where_eq", [](const Table& self, const std::string& col, py::object val) -> py::list {
187
+ std::vector<size_t> rows;
188
+ DType dt = self.column_dtype(col);
189
+ if (dt == DType::String) {
190
+ rows = self.where_eq(col, std::string(py::str(val)));
191
+ } else {
192
+ rows = self.where_eq(col, py::cast<int64_t>(val));
193
+ }
194
+ py::list result;
195
+ for (auto r : rows) result.append(r);
196
+ return result;
197
+ }, py::arg("col"), py::arg("val"), "Filter rows where col == val")
198
+ .def("compact", &Table::compact,
199
+ "Physically remove deleted rows and rebuild index")
200
+ .def("save", &Table::save)
201
+ .def_static("load", &Table::load)
202
+ .def_static("mmap", &Table::mmap)
203
+ .def("add_column_from_numpy", [](Table& self, const std::string& name,
204
+ py::array arr, DType dtype) {
205
+ py::buffer_info buf = arr.request();
206
+ if (buf.ndim != 1) {
207
+ throw std::runtime_error("Array must be 1-dimensional");
208
+ }
209
+ // Copy data into owned buffer
210
+ size_t elem_size = dtype_size(dtype);
211
+ std::vector<uint8_t> data(buf.size * elem_size);
212
+ std::memcpy(data.data(), buf.ptr, data.size());
213
+ self.add_column(Column(name, dtype, std::move(data)));
214
+ }, py::arg("name"), py::arg("array"), py::arg("dtype"))
215
+ .def("add_string_column_from_list", [](Table& self, const std::string& name,
216
+ py::list strings) {
217
+ StringColumn sc(name);
218
+ for (auto& item : strings) {
219
+ if (item.is_none()) {
220
+ sc.append_null();
221
+ } else {
222
+ sc.append(std::string(py::str(item)));
223
+ }
224
+ }
225
+ self.add_string_column(std::move(sc));
226
+ }, py::arg("name"), py::arg("strings"),
227
+ "Add a string column from a Python list")
228
+ .def("to_dict", [](Table& self) -> py::dict {
229
+ py::dict result;
230
+ for (const auto& col_name : self.column_names()) {
231
+ if (self.has_string_column(col_name)) {
232
+ auto& scol = self.string_column(col_name);
233
+ result[py::cast(col_name)] = py::cast(scol.to_vector());
234
+ } else {
235
+ Column& col = self.column(col_name);
236
+ py::dtype dt = wayy_dtype_to_numpy(col.dtype());
237
+ // Make a copy for the dict
238
+ py::array arr(dt, {col.size()}, {dtype_size(col.dtype())}, col.data());
239
+ result[py::cast(col_name)] = arr.attr("copy")();
240
+ }
241
+ }
242
+ return result;
243
+ });
244
+
245
+ // Database class
246
+ py::class_<Database>(m, "Database")
247
+ .def(py::init<>())
248
+ .def(py::init<const std::string&>(), py::arg("path"))
249
+ .def_property_readonly("path", &Database::path)
250
+ .def_property_readonly("is_persistent", &Database::is_persistent)
251
+ .def("tables", &Database::tables)
252
+ .def("has_table", &Database::has_table)
253
+ .def("table", &Database::table, py::return_value_policy::reference_internal)
254
+ .def("__getitem__", &Database::table, py::return_value_policy::reference_internal)
255
+ .def("create_table", &Database::create_table, py::return_value_policy::reference_internal)
256
+ .def("add_table", [](Database& db, Table& table) {
257
+ db.add_table(std::move(table));
258
+ })
259
+ .def("drop_table", &Database::drop_table)
260
+ .def("save", &Database::save)
261
+ .def("refresh", &Database::refresh)
262
+ .def("checkpoint", &Database::checkpoint,
263
+ "Flush WAL, save all tables, truncate WAL");
264
+
265
+ // Operations submodule
266
+ py::module_ ops_mod = m.def_submodule("ops", "WayyDB operations");
267
+
268
+ // Aggregations - use lambdas to avoid overload issues
269
+ // All aggregations release the GIL for concurrent execution
270
+ ops_mod.def("sum", [](const Column& col) { return wdb_ops::sum(col); },
271
+ py::arg("col"), release_gil(), "Sum of column values");
272
+ ops_mod.def("avg", [](const Column& col) { return wdb_ops::avg(col); },
273
+ py::arg("col"), release_gil(), "Average of column values");
274
+ ops_mod.def("min", [](const Column& col) { return wdb_ops::min_val(col); },
275
+ py::arg("col"), release_gil(), "Minimum value");
276
+ ops_mod.def("max", [](const Column& col) { return wdb_ops::max_val(col); },
277
+ py::arg("col"), release_gil(), "Maximum value");
278
+ ops_mod.def("std", [](const Column& col) { return wdb_ops::std_dev(col); },
279
+ py::arg("col"), release_gil(), "Standard deviation");
280
+
281
+ // Joins - release GIL for concurrent execution
282
+ ops_mod.def("aj", &wdb_ops::aj,
283
+ py::arg("left"), py::arg("right"), py::arg("on"), py::arg("as_of"),
284
+ release_gil(),
285
+ "As-of join: find most recent right row for each left row");
286
+ ops_mod.def("wj", &wdb_ops::wj,
287
+ py::arg("left"), py::arg("right"), py::arg("on"), py::arg("as_of"),
288
+ py::arg("window_before"), py::arg("window_after"),
289
+ release_gil(),
290
+ "Window join: find all right rows within time window");
291
+
292
+ // Window functions (returning numpy arrays)
293
+ // These compute with GIL released, then briefly reacquire to create numpy array
294
+ ops_mod.def("mavg", [](Column& col, size_t window) -> py::array_t<double> {
295
+ std::vector<double> result;
296
+ {
297
+ py::gil_scoped_release release;
298
+ result = wdb_ops::mavg(col.as_float64(), window);
299
+ }
300
+ return py::array_t<double>(result.size(), result.data());
301
+ }, py::arg("col"), py::arg("window"), "Moving average");
302
+
303
+ ops_mod.def("msum", [](Column& col, size_t window) -> py::array_t<double> {
304
+ std::vector<double> result;
305
+ {
306
+ py::gil_scoped_release release;
307
+ result = wdb_ops::msum(col.as_float64(), window);
308
+ }
309
+ return py::array_t<double>(result.size(), result.data());
310
+ }, py::arg("col"), py::arg("window"), "Moving sum");
311
+
312
+ ops_mod.def("mstd", [](Column& col, size_t window) -> py::array_t<double> {
313
+ std::vector<double> result;
314
+ {
315
+ py::gil_scoped_release release;
316
+ result = wdb_ops::mstd(col.as_float64(), window);
317
+ }
318
+ return py::array_t<double>(result.size(), result.data());
319
+ }, py::arg("col"), py::arg("window"), "Moving standard deviation");
320
+
321
+ ops_mod.def("mmin", [](Column& col, size_t window) -> py::array_t<double> {
322
+ std::vector<double> result;
323
+ {
324
+ py::gil_scoped_release release;
325
+ result = wdb_ops::mmin(col.as_float64(), window);
326
+ }
327
+ return py::array_t<double>(result.size(), result.data());
328
+ }, py::arg("col"), py::arg("window"), "Moving minimum");
329
+
330
+ ops_mod.def("mmax", [](Column& col, size_t window) -> py::array_t<double> {
331
+ std::vector<double> result;
332
+ {
333
+ py::gil_scoped_release release;
334
+ result = wdb_ops::mmax(col.as_float64(), window);
335
+ }
336
+ return py::array_t<double>(result.size(), result.data());
337
+ }, py::arg("col"), py::arg("window"), "Moving maximum");
338
+
339
+ ops_mod.def("ema", [](Column& col, double alpha) -> py::array_t<double> {
340
+ std::vector<double> result;
341
+ {
342
+ py::gil_scoped_release release;
343
+ result = wdb_ops::ema(col.as_float64(), alpha);
344
+ }
345
+ return py::array_t<double>(result.size(), result.data());
346
+ }, py::arg("col"), py::arg("alpha"), "Exponential moving average");
347
+
348
+ ops_mod.def("diff", [](Column& col, size_t periods) -> py::array_t<double> {
349
+ std::vector<double> result;
350
+ {
351
+ py::gil_scoped_release release;
352
+ result = wdb_ops::diff(col.as_float64(), periods);
353
+ }
354
+ return py::array_t<double>(result.size(), result.data());
355
+ }, py::arg("col"), py::arg("periods") = 1, "Difference between consecutive values");
356
+
357
+ ops_mod.def("pct_change", [](Column& col, size_t periods) -> py::array_t<double> {
358
+ std::vector<double> result;
359
+ {
360
+ py::gil_scoped_release release;
361
+ result = wdb_ops::pct_change(col.as_float64(), periods);
362
+ }
363
+ return py::array_t<double>(result.size(), result.data());
364
+ }, py::arg("col"), py::arg("periods") = 1, "Percent change");
365
+
366
+ ops_mod.def("shift", [](Column& col, int64_t n) -> py::array_t<double> {
367
+ std::vector<double> result;
368
+ {
369
+ py::gil_scoped_release release;
370
+ result = wdb_ops::shift(col.as_float64(), n);
371
+ }
372
+ return py::array_t<double>(result.size(), result.data());
373
+ }, py::arg("col"), py::arg("n"), "Shift values by n positions");
374
+
375
+ // Version info
376
+ m.attr("__version__") = "0.2.0";
377
+ }
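
Because the module is declared with py::mod_gil_not_used() and the aggregation and join bindings wrap their work in gil_scoped_release, read-only operations can overlap across threads. A hedged sketch of that pattern; the data size and worker count are invented.

from concurrent.futures import ThreadPoolExecutor

import numpy as np
import wayy_db
from wayy_db import ops

t = wayy_db.from_dict({"px": np.random.default_rng(0).normal(size=1_000_000)})
col = t["px"]

# Each aggregation releases the GIL, so the four calls can run concurrently.
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(fn, col) for fn in (ops.sum, ops.avg, ops.min, ops.max)]
    print([f.result() for f in futures])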
python/wayy_db/__init__.py ADDED
@@ -0,0 +1,122 @@
1
+ """
2
+ WayyDB: High-performance columnar time-series database
3
+
4
+ A kdb+-like database with Python-first API, featuring:
5
+ - As-of joins (aj) and window joins (wj)
6
+ - Zero-copy numpy interop via memory mapping
7
+ - SIMD-accelerated aggregations
8
+ - Columnar storage with sorted indices
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from wayy_db._core import (
14
+ # Core classes
15
+ Database,
16
+ Table,
17
+ Column,
18
+ StringColumn,
19
+ # Types
20
+ DType,
21
+ # Exceptions
22
+ WayyException,
23
+ ColumnNotFound,
24
+ TypeMismatch,
25
+ InvalidOperation,
26
+ # Version
27
+ __version__,
28
+ )
29
+
30
+ # Operations module
31
+ from wayy_db import ops
32
+
33
+ __all__ = [
34
+ # Core classes
35
+ "Database",
36
+ "Table",
37
+ "Column",
38
+ "StringColumn",
39
+ # Types
40
+ "DType",
41
+ # Exceptions
42
+ "WayyException",
43
+ "ColumnNotFound",
44
+ "TypeMismatch",
45
+ "InvalidOperation",
46
+ # Submodules
47
+ "ops",
48
+ # Version
49
+ "__version__",
50
+ ]
51
+
52
+
53
+ def from_dict(data: dict, name: str = "", sorted_by: str | None = None) -> Table:
54
+ """Create a Table from a dictionary of numpy arrays.
55
+
56
+ Args:
57
+ data: Dictionary mapping column names to numpy arrays
58
+ name: Optional table name
59
+ sorted_by: Optional column name to mark as sorted index
60
+
61
+ Returns:
62
+ Table with the provided data
63
+ """
64
+ import numpy as np
65
+
66
+ table = Table(name)
67
+
68
+ dtype_map = {
69
+ np.dtype("int64"): DType.Int64,
70
+ np.dtype("float64"): DType.Float64,
71
+ np.dtype("uint32"): DType.Symbol,
72
+ np.dtype("uint8"): DType.Bool,
73
+ }
74
+
75
+ for col_name, arr in data.items():
76
+ arr = np.asarray(arr)
77
+ if arr.dtype not in dtype_map:
78
+ # Try to convert
79
+ if np.issubdtype(arr.dtype, np.integer):
80
+ arr = arr.astype(np.int64)
81
+ elif np.issubdtype(arr.dtype, np.floating):
82
+ arr = arr.astype(np.float64)
83
+ else:
84
+ raise ValueError(f"Unsupported dtype {arr.dtype} for column {col_name}")
85
+
86
+ dtype = dtype_map[arr.dtype]
87
+ table.add_column_from_numpy(col_name, arr, dtype)
88
+
89
+ if sorted_by is not None:
90
+ table.set_sorted_by(sorted_by)
91
+
92
+ return table
93
+
94
+
95
+ def from_pandas(df, name: str = "", sorted_by: str | None = None) -> Table:
96
+ """Create a Table from a pandas DataFrame.
97
+
98
+ Args:
99
+ df: pandas DataFrame
100
+ name: Optional table name
101
+ sorted_by: Optional column name to mark as sorted index
102
+
103
+ Returns:
104
+ Table with the DataFrame data
105
+ """
106
+ data = {col: df[col].values for col in df.columns}
107
+ return from_dict(data, name=name, sorted_by=sorted_by)
108
+
109
+
110
+ def from_polars(df, name: str = "", sorted_by: str | None = None) -> Table:
111
+ """Create a Table from a polars DataFrame.
112
+
113
+ Args:
114
+ df: polars DataFrame
115
+ name: Optional table name
116
+ sorted_by: Optional column name to mark as sorted index
117
+
118
+ Returns:
119
+ Table with the DataFrame data
120
+ """
121
+ data = {col: df[col].to_numpy() for col in df.columns}
122
+ return from_dict(data, name=name, sorted_by=sorted_by)
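
from_dict, from_pandas, and from_polars are thin converters over add_column_from_numpy. A hedged pandas round trip (pandas itself is only listed under the dev extra, so treating it as installed is an assumption); the frame contents are invented.

import pandas as pd
import wayy_db

df = pd.DataFrame({
    "timestamp": [1, 2, 3],
    "price": [10.0, 10.5, 10.2],
})
t = wayy_db.from_pandas(df, name="ticks", sorted_by="timestamp")

print(t.num_rows, t.column_names(), t.sorted_by)
print(t.to_dict()["price"])   # copies back out as a numpy array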
python/wayy_db/_core.pyi ADDED
@@ -0,0 +1,113 @@
1
+ """Type stubs for wayy_db._core C++ extension module."""
2
+
3
+ from typing import Optional, Sequence
4
+ import numpy as np
5
+ import numpy.typing as npt
6
+
7
+ __version__: str
8
+
9
+ class DType:
10
+ Int64: DType
11
+ Float64: DType
12
+ Timestamp: DType
13
+ Symbol: DType
14
+ Bool: DType
15
+
16
+ class WayyException(Exception): ...
17
+ class ColumnNotFound(WayyException): ...
18
+ class TypeMismatch(WayyException): ...
19
+ class InvalidOperation(WayyException): ...
20
+
21
+ class Column:
22
+ @property
23
+ def name(self) -> str: ...
24
+ @property
25
+ def dtype(self) -> DType: ...
26
+ @property
27
+ def size(self) -> int: ...
28
+ def __len__(self) -> int: ...
29
+ def to_numpy(self) -> npt.NDArray: ...
30
+
31
+ class Table:
32
+ def __init__(self, name: str = "") -> None: ...
33
+ @property
34
+ def name(self) -> str: ...
35
+ @property
36
+ def num_rows(self) -> int: ...
37
+ @property
38
+ def num_columns(self) -> int: ...
39
+ @property
40
+ def sorted_by(self) -> Optional[str]: ...
41
+ def __len__(self) -> int: ...
42
+ def has_column(self, name: str) -> bool: ...
43
+ def column(self, name: str) -> Column: ...
44
+ def __getitem__(self, name: str) -> Column: ...
45
+ def column_names(self) -> list[str]: ...
46
+ def set_sorted_by(self, col: str) -> None: ...
47
+ def save(self, path: str) -> None: ...
48
+ @staticmethod
49
+ def load(path: str) -> Table: ...
50
+ @staticmethod
51
+ def mmap(path: str) -> Table: ...
52
+ def add_column_from_numpy(
53
+ self, name: str, array: npt.NDArray, dtype: DType
54
+ ) -> None: ...
55
+ def to_dict(self) -> dict[str, npt.NDArray]: ...
56
+
57
+ class Database:
58
+ def __init__(self, path: str = "") -> None: ...
59
+ @property
60
+ def path(self) -> str: ...
61
+ @property
62
+ def is_persistent(self) -> bool: ...
63
+ def tables(self) -> list[str]: ...
64
+ def has_table(self, name: str) -> bool: ...
65
+ def table(self, name: str) -> Table: ...
66
+ def __getitem__(self, name: str) -> Table: ...
67
+ def create_table(self, name: str) -> Table: ...
68
+ def drop_table(self, name: str) -> None: ...
69
+ def save(self) -> None: ...
70
+ def refresh(self) -> None: ...
71
+
72
+ class ops:
73
+ @staticmethod
74
+ def sum(col: Column) -> float: ...
75
+ @staticmethod
76
+ def avg(col: Column) -> float: ...
77
+ @staticmethod
78
+ def min(col: Column) -> float: ...
79
+ @staticmethod
80
+ def max(col: Column) -> float: ...
81
+ @staticmethod
82
+ def std(col: Column) -> float: ...
83
+ @staticmethod
84
+ def aj(
85
+ left: Table, right: Table, on: Sequence[str], as_of: str
86
+ ) -> Table: ...
87
+ @staticmethod
88
+ def wj(
89
+ left: Table,
90
+ right: Table,
91
+ on: Sequence[str],
92
+ as_of: str,
93
+ window_before: int,
94
+ window_after: int,
95
+ ) -> Table: ...
96
+ @staticmethod
97
+ def mavg(col: Column, window: int) -> npt.NDArray[np.float64]: ...
98
+ @staticmethod
99
+ def msum(col: Column, window: int) -> npt.NDArray[np.float64]: ...
100
+ @staticmethod
101
+ def mstd(col: Column, window: int) -> npt.NDArray[np.float64]: ...
102
+ @staticmethod
103
+ def mmin(col: Column, window: int) -> npt.NDArray[np.float64]: ...
104
+ @staticmethod
105
+ def mmax(col: Column, window: int) -> npt.NDArray[np.float64]: ...
106
+ @staticmethod
107
+ def ema(col: Column, alpha: float) -> npt.NDArray[np.float64]: ...
108
+ @staticmethod
109
+ def diff(col: Column, periods: int = 1) -> npt.NDArray[np.float64]: ...
110
+ @staticmethod
111
+ def pct_change(col: Column, periods: int = 1) -> npt.NDArray[np.float64]: ...
112
+ @staticmethod
113
+ def shift(col: Column, n: int) -> npt.NDArray[np.float64]: ...
python/wayy_db/cli/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """WayyDB CLI - command-line interface for the WayyDB service."""
python/wayy_db/cli/client.py ADDED
@@ -0,0 +1,300 @@
1
+ """HTTP client for the WayyDB service."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Any, Optional
8
+
9
+ import httpx
10
+
11
+ from wayy_db.cli.config import get_server_url
12
+
13
+ # The API uses /api/v1/{db_name}/... for OLTP routes but db_name is unused
14
+ # server-side (single global db). We hardcode "default" for forward compat.
15
+ _DB_NAME = "default"
16
+
17
+
18
+ class WayyClientError(Exception):
19
+ """Raised when the WayyDB service returns an error."""
20
+
21
+ def __init__(self, status_code: int, detail: str) -> None:
22
+ self.status_code = status_code
23
+ self.detail = detail
24
+ super().__init__(f"HTTP {status_code}: {detail}")
25
+
26
+
27
+ class WayyClient:
28
+ """Synchronous HTTP client for the WayyDB REST API."""
29
+
30
+ def __init__(self, base_url: Optional[str] = None, timeout: float = 30.0) -> None:
31
+ self.base_url = (base_url or get_server_url()).rstrip("/")
32
+ self._client = httpx.Client(base_url=self.base_url, timeout=timeout)
33
+
34
+ def _request(self, method: str, path: str, **kwargs: Any) -> Any:
35
+ """Make an HTTP request and return JSON response."""
36
+ try:
37
+ resp = self._client.request(method, path, **kwargs)
38
+ except httpx.ConnectError:
39
+ raise WayyClientError(0, f"Cannot connect to {self.base_url}")
40
+ if resp.status_code >= 400:
41
+ try:
42
+ detail = resp.json().get("detail", resp.text)
43
+ except Exception:
44
+ detail = resp.text
45
+ raise WayyClientError(resp.status_code, detail)
46
+ if resp.status_code == 204 or not resp.content:
47
+ return {}
48
+ return resp.json()
49
+
50
+ # --- Health ---
51
+
52
+ def health(self) -> dict[str, Any]:
53
+ return self._request("GET", "/health")
54
+
55
+ def info(self) -> dict[str, Any]:
56
+ return self._request("GET", "/")
57
+
58
+ # --- Tables ---
59
+
60
+ def list_tables(self) -> list[str]:
61
+ data = self._request("GET", "/tables")
62
+ return data.get("tables", [])
63
+
64
+ def get_table_info(self, name: str) -> dict[str, Any]:
65
+ return self._request("GET", f"/tables/{name}")
66
+
67
+ def get_table_data(
68
+ self, name: str, limit: int = 100, offset: int = 0
69
+ ) -> dict[str, Any]:
70
+ return self._request(
71
+ "GET", f"/tables/{name}/data", params={"limit": limit, "offset": offset}
72
+ )
73
+
74
+ def create_table(
75
+ self,
76
+ name: str,
77
+ columns: list[dict[str, str]],
78
+ primary_key: Optional[str] = None,
79
+ sorted_by: Optional[str] = None,
80
+ ) -> dict[str, Any]:
81
+ payload = {
82
+ "name": name,
83
+ "columns": columns,
84
+ "primary_key": primary_key,
85
+ "sorted_by": sorted_by,
86
+ }
87
+ return self._request("POST", f"/api/v1/{_DB_NAME}/tables", json=payload)
88
+
89
+ def drop_table(self, name: str) -> dict[str, Any]:
90
+ return self._request("DELETE", f"/tables/{name}")
91
+
92
+ def upload_table(self, table_data: dict[str, Any]) -> dict[str, Any]:
93
+ return self._request("POST", "/tables/upload", json=table_data)
94
+
95
+ def append_rows(self, name: str, columns: list[dict[str, Any]]) -> dict[str, Any]:
96
+ return self._request("POST", f"/tables/{name}/append", json={"columns": columns})
97
+
98
+ # --- OLTP ---
99
+
100
+ def insert_row(self, table: str, data: dict[str, Any]) -> dict[str, Any]:
101
+ return self._request(
102
+ "POST", f"/api/v1/{_DB_NAME}/tables/{table}/rows", json={"data": data}
103
+ )
104
+
105
+ def get_row(self, table: str, pk: str) -> dict[str, Any]:
106
+ return self._request("GET", f"/api/v1/{_DB_NAME}/tables/{table}/rows/{pk}")
107
+
108
+ def update_row(self, table: str, pk: str, data: dict[str, Any]) -> dict[str, Any]:
109
+ return self._request(
110
+ "PUT", f"/api/v1/{_DB_NAME}/tables/{table}/rows/{pk}", json={"data": data}
111
+ )
112
+
113
+ def delete_row(self, table: str, pk: str) -> dict[str, Any]:
114
+ return self._request("DELETE", f"/api/v1/{_DB_NAME}/tables/{table}/rows/{pk}")
115
+
116
+ def filter_rows(
117
+ self, table: str, filters: Optional[dict[str, str]] = None, limit: int = 500
118
+ ) -> dict[str, Any]:
119
+ params = dict(filters or {})
120
+ params["limit"] = str(limit)
121
+ return self._request(
122
+ "GET", f"/api/v1/{_DB_NAME}/tables/{table}/rows", params=params
123
+ )
124
+
125
+ # --- Aggregations ---
126
+
127
+ def aggregate(self, table: str, column: str, op: str) -> dict[str, Any]:
128
+ return self._request("GET", f"/tables/{table}/agg/{column}/{op}")
129
+
130
+ # --- Joins ---
131
+
132
+ def as_of_join(
133
+ self, left: str, right: str, on: list[str], as_of: str
134
+ ) -> dict[str, Any]:
135
+ payload = {"left_table": left, "right_table": right, "on": on, "as_of": as_of}
136
+ return self._request("POST", "/join/aj", json=payload)
137
+
138
+ def window_join(
139
+ self,
140
+ left: str,
141
+ right: str,
142
+ on: list[str],
143
+ as_of: str,
144
+ before: int,
145
+ after: int,
146
+ ) -> dict[str, Any]:
147
+ payload = {
148
+ "left_table": left,
149
+ "right_table": right,
150
+ "on": on,
151
+ "as_of": as_of,
152
+ "window_before": before,
153
+ "window_after": after,
154
+ }
155
+ return self._request("POST", "/join/wj", json=payload)
156
+
157
+ # --- Window functions ---
158
+
159
+ def window_function(
160
+ self,
161
+ table: str,
162
+ column: str,
163
+ operation: str,
164
+ window: Optional[int] = None,
165
+ alpha: Optional[float] = None,
166
+ ) -> dict[str, Any]:
167
+ payload: dict[str, Any] = {
168
+ "table": table,
169
+ "column": column,
170
+ "operation": operation,
171
+ }
172
+ if window is not None:
173
+ payload["window"] = window
174
+ if alpha is not None:
175
+ payload["alpha"] = alpha
176
+ return self._request("POST", "/window", json=payload)
177
+
178
+ # --- Streaming ---
179
+
180
+ def ingest_tick(self, table: str, tick: dict[str, Any]) -> dict[str, Any]:
181
+ return self._request("POST", f"/ingest/{table}", json=tick)
182
+
183
+ def ingest_batch(self, table: str, ticks: list[dict[str, Any]]) -> dict[str, Any]:
184
+ return self._request("POST", f"/ingest/{table}/batch", json={"ticks": ticks})
185
+
186
+ def get_streaming_stats(self) -> dict[str, Any]:
187
+ return self._request("GET", "/streaming/stats")
188
+
189
+ def get_quote(self, table: str, symbol: str) -> dict[str, Any]:
190
+ return self._request("GET", f"/streaming/quote/{table}/{symbol}")
191
+
192
+ def get_all_quotes(self, table: str) -> dict[str, Any]:
193
+ return self._request("GET", f"/streaming/quotes/{table}")
194
+
195
+ # --- KV Store ---
196
+
197
+ def kv_get(self, key: str) -> Any:
198
+ data = self._request("GET", f"/kv/{key}")
199
+ return data.get("value")
200
+
201
+ def kv_set(self, key: str, value: Any, ttl: Optional[float] = None) -> dict[str, Any]:
202
+ payload: dict[str, Any] = {"value": value}
203
+ if ttl is not None:
204
+ payload["ttl"] = ttl
205
+ return self._request("POST", f"/kv/{key}", json=payload)
206
+
207
+ def kv_delete(self, key: str) -> dict[str, Any]:
208
+ return self._request("DELETE", f"/kv/{key}")
209
+
210
+ def kv_list(self, pattern: Optional[str] = None) -> list[str]:
211
+ params = {}
212
+ if pattern:
213
+ params["pattern"] = pattern
214
+ data = self._request("GET", "/kv", params=params)
215
+ return data.get("keys", [])
216
+
217
+ # --- Checkpoint ---
218
+
219
+ def checkpoint(self) -> dict[str, Any]:
220
+ return self._request("POST", f"/api/v1/{_DB_NAME}/checkpoint")
221
+
222
+ def close(self) -> None:
223
+ self._client.close()
224
+
225
+ def __enter__(self) -> "WayyClient":
226
+ return self
227
+
228
+ def __exit__(self, *args: Any) -> None:
229
+ self.close()
230
+
231
+
232
+ def upload_csv(
233
+ client: WayyClient, name: str, file_path: Path, sorted_by: Optional[str] = None
234
+ ) -> dict[str, Any]:
235
+ """Read a CSV file and upload it as a table.
236
+
237
+ Uses stdlib csv to avoid requiring pandas in CLI.
238
+ """
239
+ import csv
240
+
241
+ with open(file_path, newline="") as f:
242
+ reader = csv.reader(f)
243
+ headers = next(reader)
244
+ rows = list(reader)
245
+
246
+ if not rows:
247
+ raise ValueError("CSV file is empty (no data rows)")
248
+
249
+ columns: list[dict[str, Any]] = []
250
+ for i, header in enumerate(headers):
251
+ raw_values = [row[i] for row in rows]
252
+ dtype, data = _infer_column(raw_values)
253
+ columns.append({"name": header, "dtype": dtype, "data": data})
254
+
255
+ payload = {"name": name, "columns": columns, "sorted_by": sorted_by}
256
+ return client.upload_table(payload)
257
+
258
+
259
+ def _infer_column(values: list[str]) -> tuple[str, list[Any]]:
260
+ """Infer column dtype from string values. Returns (dtype_name, typed_data)."""
261
+ non_empty = [v for v in values if v.strip()]
262
+ if not non_empty:
263
+ return ("float64", [0.0] * len(values))
264
+
265
+ # Try int64
266
+ try:
267
+ data = [int(v) if v.strip() else 0 for v in values]
268
+ return ("int64", data)
269
+ except (ValueError, OverflowError):
270
+ pass
271
+
272
+ # Try float64 (handles empty cells as NaN)
273
+ try:
274
+ data = [float(v) if v.strip() else float("nan") for v in values]
275
+ return ("float64", data)
276
+ except (ValueError, OverflowError):
277
+ pass
278
+
279
+ raise ValueError(
280
+ f"Non-numeric column detected. Values: {values[:3]}... "
281
+ "CSV upload currently supports numeric columns only. "
282
+ "Use the Python API with from_pandas() for string/symbol columns."
283
+ )
284
+
285
+
286
+ def upload_json_ticks(
287
+ client: WayyClient, table: str, file_path: Path
288
+ ) -> dict[str, Any]:
289
+ """Read a JSON file of ticks and batch-ingest them."""
290
+ with open(file_path) as f:
291
+ data = json.load(f)
292
+
293
+ if isinstance(data, list):
294
+ ticks = data
295
+ elif isinstance(data, dict) and "ticks" in data:
296
+ ticks = data["ticks"]
297
+ else:
298
+ raise ValueError("JSON must be a list of ticks or {\"ticks\": [...]}")
299
+
300
+ return client.ingest_batch(table, ticks)
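
A minimal sketch of driving these client helpers directly from Python instead of through the CLI. It assumes a WayyDB server is already listening on localhost:8080 (e.g. started with `wayy deploy local`) and a local trades.csv with numeric columns; both are placeholders.

from pathlib import Path

from wayy_db.cli.client import WayyClient, upload_csv

# Assumes a server started elsewhere, e.g. `wayy deploy local`.
with WayyClient(base_url="http://localhost:8080") as client:
    # "trades.csv" is a placeholder; CSV upload supports numeric columns only.
    result = upload_csv(client, "trades", Path("trades.csv"), sorted_by="ts")
    print(result)

    # The same client also covers the KV endpoints used by `wayy kv ...`.
    client.kv_set("last_upload", {"table": "trades"}, ttl=3600)
    print(client.kv_get("last_upload"))
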
python/wayy_db/cli/config.py ADDED
@@ -0,0 +1,42 @@
1
+ """Configuration management for the WayyDB CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+
10
+ CONFIG_DIR = Path.home() / ".wayy"
11
+ CONFIG_FILE = CONFIG_DIR / "config.json"
12
+
13
+ DEFAULTS: dict[str, Any] = {
14
+ "server_url": "http://localhost:8080",
15
+ "format": "table",
16
+ "db_name": "default",
17
+ }
18
+
19
+
20
+ def load_config() -> dict[str, Any]:
21
+ """Load config from ~/.wayy/config.json, creating defaults if missing."""
22
+ if CONFIG_FILE.exists():
23
+ with open(CONFIG_FILE) as f:
24
+ return {**DEFAULTS, **json.load(f)}
25
+ return dict(DEFAULTS)
26
+
27
+
28
+ def save_config(config: dict[str, Any]) -> None:
29
+ """Save config to ~/.wayy/config.json."""
30
+ CONFIG_DIR.mkdir(parents=True, exist_ok=True)
31
+ with open(CONFIG_FILE, "w") as f:
32
+ json.dump(config, f, indent=2)
33
+
34
+
35
+ def get_server_url() -> str:
36
+ """Get the configured server URL."""
37
+ return load_config()["server_url"]
38
+
39
+
40
+ def get_db_name() -> str:
41
+ """Get the configured database name."""
42
+ return load_config()["db_name"]
python/wayy_db/cli/deploy.py ADDED
@@ -0,0 +1,284 @@
1
+ """Deployment commands for the WayyDB CLI.
2
+
3
+ Supports:
4
+ - Local: start uvicorn directly or via Docker
5
+ - HuggingFace Spaces: push to HF Docker space
6
+ - Docker: build and run container
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ import shutil
13
+ import subprocess
14
+ import sys
15
+ from pathlib import Path
16
+ from typing import Optional
17
+
18
+ import typer
19
+
20
+ from wayy_db.cli.config import load_config, save_config
21
+ from wayy_db.cli.output import console, print_error, print_info, print_success
22
+
23
+ deploy_app = typer.Typer(
24
+ name="deploy",
25
+ help="Deploy WayyDB service",
26
+ no_args_is_help=True,
27
+ )
28
+
29
+
30
+ def _find_project_root() -> Path:
31
+ """Walk up from cwd looking for pyproject.toml with wayy-db."""
32
+ cwd = Path.cwd()
33
+ for parent in [cwd, *cwd.parents]:
34
+ toml = parent / "pyproject.toml"
35
+ if toml.exists() and "wayy-db" in toml.read_text():
36
+ return parent
37
+ raise FileNotFoundError(
38
+ "Cannot find wayyDB project root (no pyproject.toml with wayy-db found). "
39
+ "Run this command from within the wayyDB repo."
40
+ )
41
+
42
+
43
+ def _run(cmd: list[str], cwd: Optional[Path] = None, check: bool = True) -> subprocess.CompletedProcess[str]:
44
+ """Run a subprocess with live output."""
45
+ console.print(f"[dim]$ {' '.join(cmd)}[/dim]")
46
+ return subprocess.run(cmd, cwd=cwd, check=check, text=True)
47
+
48
+
49
+ # --- Local serve ---
50
+
51
+
52
+ @deploy_app.command("local")
53
+ def deploy_local(
54
+ port: int = typer.Option(8080, "--port", "-p", help="Port to serve on"),
55
+ host: str = typer.Option("0.0.0.0", "--host", help="Host to bind to"),
56
+ data_path: str = typer.Option("./data/wayydb", "--data-path", "-d", help="Data directory"),
57
+ workers: int = typer.Option(1, "--workers", "-w", help="Number of uvicorn workers"),
58
+ ) -> None:
59
+ """Start WayyDB server locally with uvicorn."""
60
+ os.makedirs(data_path, exist_ok=True)
61
+ os.environ["WAYY_DATA_PATH"] = str(Path(data_path).resolve())
62
+ os.environ["PORT"] = str(port)
63
+ os.environ["CORS_ORIGINS"] = "*"
64
+
65
+ print_info("Data path", os.environ["WAYY_DATA_PATH"])
66
+ print_info("Serving on", f"http://{host}:{port}")
67
+ console.print("[dim]Press Ctrl+C to stop[/dim]\n")
68
+
69
+ try:
70
+ _find_project_root()
71
+ # Running from source — use api.main:app
72
+ api_module = "api.main:app"
73
+ except FileNotFoundError:
74
+ # Installed package — api module should be importable
75
+ api_module = "api.main:app"
76
+
77
+ cmd = [
78
+ sys.executable, "-m", "uvicorn", api_module,
79
+ "--host", host,
80
+ "--port", str(port),
81
+ "--workers", str(workers),
82
+ ]
83
+
84
+ try:
85
+ _run(cmd)
86
+ except KeyboardInterrupt:
87
+ console.print("\n[dim]Server stopped.[/dim]")
88
+ except subprocess.CalledProcessError:
89
+ print_error("Failed to start server. Is uvicorn installed? (pip install wayy-db[api])")
90
+ raise typer.Exit(1)
91
+
92
+
93
+ # --- Docker ---
94
+
95
+
96
+ @deploy_app.command("docker")
97
+ def deploy_docker(
98
+ port: int = typer.Option(8080, "--port", "-p", help="Host port to expose"),
99
+ tag: str = typer.Option("wayydb:latest", "--tag", "-t", help="Docker image tag"),
100
+ data_volume: str = typer.Option("wayydb-data", "--volume", "-v", help="Docker volume for data persistence"),
101
+ build: bool = typer.Option(True, "--build/--no-build", help="Build image before running"),
102
+ detach: bool = typer.Option(True, "--detach/--foreground", help="Run in background"),
103
+ ) -> None:
104
+ """Build and run WayyDB in Docker."""
105
+ if not shutil.which("docker"):
106
+ print_error("Docker not found. Install Docker: https://docs.docker.com/get-docker/")
107
+ raise typer.Exit(1)
108
+
109
+ try:
110
+ root = _find_project_root()
111
+ except FileNotFoundError as e:
112
+ print_error(str(e))
113
+ raise typer.Exit(1)
114
+
115
+ if build:
116
+ console.print("[bold]Building Docker image...[/bold]")
117
+ _run(["docker", "build", "-t", tag, "."], cwd=root)
118
+ print_success(f"Built {tag}")
119
+
120
+ # Create volume if needed
121
+ _run(["docker", "volume", "create", data_volume], check=False)
122
+
123
+ # Stop existing container if running
124
+ _run(["docker", "rm", "-f", "wayydb"], check=False)
125
+
126
+ cmd = [
127
+ "docker", "run",
128
+ "--name", "wayydb",
129
+ "-p", f"{port}:8080",
130
+ "-v", f"{data_volume}:/data/wayydb",
131
+ "-e", "CORS_ORIGINS=*",
132
+ ]
133
+
134
+ if detach:
135
+ cmd.append("-d")
136
+
137
+ cmd.append(tag)
138
+
139
+ _run(cmd)
140
+
141
+ if detach:
142
+ print_success(f"WayyDB running at http://localhost:{port}")
143
+ print_info("Container", "wayydb")
144
+ print_info("Volume", data_volume)
145
+ console.print("[dim]Stop with: docker stop wayydb[/dim]")
146
+ else:
147
+ console.print("\n[dim]Container stopped.[/dim]")
148
+
149
+
150
+ # --- HuggingFace Spaces ---
151
+
152
+
153
+ @deploy_app.command("hf")
154
+ def deploy_hf(
155
+ repo: str = typer.Option("", "--repo", "-r", help="HF Space repo (user/name). Uses git remote 'hf' if not set."),
156
+ token: Optional[str] = typer.Option(None, "--token", help="HuggingFace token (or set HF_TOKEN env var)"),
157
+ ) -> None:
158
+ """Deploy WayyDB to HuggingFace Spaces (Docker).
159
+
160
+ Pushes the current repo state to a HuggingFace Space configured as a Docker space.
161
+ The Space must already exist. Create one at: https://huggingface.co/new-space?sdk=docker
162
+ """
163
+ if not shutil.which("git"):
164
+ print_error("git not found")
165
+ raise typer.Exit(1)
166
+
167
+ try:
168
+ root = _find_project_root()
169
+ except FileNotFoundError as e:
170
+ print_error(str(e))
171
+ raise typer.Exit(1)
172
+
173
+ # Check if hf remote exists
174
+ result = subprocess.run(
175
+ ["git", "remote", "get-url", "hf"], capture_output=True, text=True, cwd=root
176
+ )
177
+ hf_remote_exists = result.returncode == 0
178
+ existing_url = result.stdout.strip() if hf_remote_exists else ""
179
+
180
+ if repo:
181
+ hf_token = token or os.environ.get("HF_TOKEN", "")
182
+ if hf_token:
183
+ remote_url = f"https://user:{hf_token}@huggingface.co/spaces/{repo}"
184
+ else:
185
+ remote_url = f"https://huggingface.co/spaces/{repo}"
186
+
187
+ if hf_remote_exists:
188
+ _run(["git", "remote", "set-url", "hf", remote_url], cwd=root)
189
+ else:
190
+ _run(["git", "remote", "add", "hf", remote_url], cwd=root)
191
+ elif not hf_remote_exists:
192
+ print_error(
193
+ "No 'hf' git remote found. Either:\n"
194
+ " 1. Run: wayy deploy hf --repo <user>/<space-name>\n"
195
+ " 2. Add manually: git remote add hf https://huggingface.co/spaces/<user>/<name>"
196
+ )
197
+ raise typer.Exit(1)
198
+
199
+ console.print("[bold]Pushing to HuggingFace Spaces...[/bold]")
200
+
201
+ # HF Spaces rejects pushes containing large files in history (even deleted ones).
202
+ # Create a clean orphan commit with only the current tree to avoid this.
203
+ try:
204
+ # Create a temporary orphan branch with just the current working tree
205
+ _run(["git", "checkout", "--orphan", "_hf_deploy"], cwd=root)
206
+ _run(["git", "add", "-A"], cwd=root)
207
+ _run(
208
+ ["git", "commit", "-m", "Deploy wayyDB to HuggingFace Spaces", "--allow-empty"],
209
+ cwd=root,
210
+ )
211
+ _run(["git", "push", "hf", "_hf_deploy:main", "--force"], cwd=root)
212
+ except subprocess.CalledProcessError:
213
+ # Clean up temp branch before erroring
214
+ subprocess.run(["git", "checkout", "main"], cwd=root, capture_output=True)
215
+ subprocess.run(["git", "branch", "-D", "_hf_deploy"], cwd=root, capture_output=True)
216
+ print_error("Push failed. Check your HF token and Space configuration.")
217
+ raise typer.Exit(1)
218
+ finally:
219
+ # Always return to main branch and clean up
220
+ subprocess.run(["git", "checkout", "main"], cwd=root, capture_output=True)
221
+ subprocess.run(["git", "branch", "-D", "_hf_deploy"], cwd=root, capture_output=True)
222
+
223
+ # Extract space URL from remote
224
+ result = subprocess.run(
225
+ ["git", "remote", "get-url", "hf"], capture_output=True, text=True, cwd=root
226
+ )
227
+ remote_url = result.stdout.strip()
228
+
229
+ # Parse space name from URL
230
+ space_name = ""
231
+ if "huggingface.co/spaces/" in remote_url:
232
+ space_name = remote_url.split("huggingface.co/spaces/")[-1].removesuffix(".git")
233
+ elif repo:
234
+ space_name = repo
235
+
236
+ if space_name:
237
+ space_url = f"https://huggingface.co/spaces/{space_name}"
238
+ # HF Spaces with Docker get a direct URL
239
+ space_direct = f"https://{space_name.replace('/', '-')}.hf.space"
240
+ print_success(f"Deployed to HuggingFace Spaces")
241
+ print_info("Space", space_url)
242
+ print_info("API", space_direct)
243
+ console.print(f"\n[dim]Connect with: wayy connect {space_direct}[/dim]")
244
+ else:
245
+ print_success("Pushed to HuggingFace Spaces")
246
+
247
+
248
+ # --- Status / logs ---
249
+
250
+
251
+ @deploy_app.command("stop")
252
+ def deploy_stop(
253
+ name: str = typer.Option("wayydb", "--name", "-n", help="Container name"),
254
+ ) -> None:
255
+ """Stop a running WayyDB Docker container."""
256
+ if not shutil.which("docker"):
257
+ print_error("Docker not found")
258
+ raise typer.Exit(1)
259
+
260
+ _run(["docker", "stop", name], check=False)
261
+ _run(["docker", "rm", name], check=False)
262
+ print_success(f"Stopped {name}")
263
+
264
+
265
+ @deploy_app.command("logs")
266
+ def deploy_logs(
267
+ name: str = typer.Option("wayydb", "--name", "-n", help="Container name"),
268
+ follow: bool = typer.Option(False, "--follow", "-f", help="Follow log output"),
269
+ tail: int = typer.Option(100, "--tail", help="Number of lines to show"),
270
+ ) -> None:
271
+ """View logs from a running WayyDB Docker container."""
272
+ if not shutil.which("docker"):
273
+ print_error("Docker not found")
274
+ raise typer.Exit(1)
275
+
276
+ cmd = ["docker", "logs", "--tail", str(tail)]
277
+ if follow:
278
+ cmd.append("-f")
279
+ cmd.append(name)
280
+
281
+ try:
282
+ _run(cmd, check=False)
283
+ except KeyboardInterrupt:
284
+ pass
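
A small smoke-test sketch that exercises the deploy sub-app without touching Docker or git. CliRunner comes from typer.testing; only the --help invocation is shown here because the real commands shell out to docker/git.

from typer.testing import CliRunner

from wayy_db.cli.deploy import deploy_app

runner = CliRunner()
result = runner.invoke(deploy_app, ["--help"])
assert result.exit_code == 0
print(result.stdout)  # lists local / docker / hf / stop / logs
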
python/wayy_db/cli/main.py ADDED
@@ -0,0 +1,522 @@
1
+ """WayyDB CLI — command-line interface for the WayyDB service.
2
+
3
+ Usage:
4
+ wayy status Check server health
5
+ wayy connect <url> Set server URL
6
+ wayy tables List all tables
7
+ wayy create <name> --schema '{}' Create a table with schema
8
+ wayy query <table> Query table data
9
+ wayy upload <name> --file data.csv Upload CSV as a table
10
+ wayy agg <table> <col> <op> Run aggregation
11
+ wayy stream <table> Subscribe to live updates
12
+ wayy ingest <table> --file ticks.json Batch ingest ticks
13
+ wayy kv get/set/del <key> Key-value operations
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ from pathlib import Path
20
+ from typing import Any, NoReturn, Optional
21
+
22
+ import typer
23
+
24
+ from wayy_db.cli.client import WayyClient, WayyClientError, upload_csv, upload_json_ticks
25
+ from wayy_db.cli.config import get_server_url, load_config, save_config
26
+ from wayy_db.cli.deploy import deploy_app
27
+ from wayy_db.cli.output import (
28
+ console,
29
+ print_error,
30
+ print_info,
31
+ print_json_data,
32
+ print_kv,
33
+ print_rows,
34
+ print_success,
35
+ print_table_data,
36
+ )
37
+
38
+ app = typer.Typer(
39
+ name="wayy",
40
+ help="WayyDB CLI — high-performance columnar time-series database",
41
+ no_args_is_help=True,
42
+ add_completion=False,
43
+ )
44
+
45
+
46
+ def _handle_error(e: WayyClientError) -> NoReturn:
47
+ if e.status_code == 0:
48
+ print_error(f"Connection failed: {e.detail}")
49
+ else:
50
+ print_error(f"Error {e.status_code}: {e.detail}")
51
+ raise typer.Exit(1)
52
+
53
+
54
+ # --- Connection ---
55
+
56
+
57
+ @app.command()
58
+ def connect(url: str = typer.Argument(..., help="WayyDB server URL")) -> None:
59
+ """Set the WayyDB server URL."""
60
+ url = url.rstrip("/")
61
+ if not url.startswith(("http://", "https://")):
62
+ url = f"http://{url}"
63
+
64
+ try:
65
+ with WayyClient(base_url=url) as client:
66
+ info = client.health()
67
+ except WayyClientError as e:
68
+ print_error(f"Cannot reach {url}: {e.detail}")
69
+ raise typer.Exit(1)
70
+
71
+ config = load_config()
72
+ config["server_url"] = url
73
+ save_config(config)
74
+ print_success(f"Connected to {url}")
75
+ print_info("Tables", info.get("tables", 0))
76
+
77
+
78
+ @app.command()
79
+ def status() -> None:
80
+ """Check server health and connection info."""
81
+ url = get_server_url()
82
+ print_info("Server", url)
83
+
84
+ try:
85
+ with WayyClient() as client:
86
+ info = client.info()
87
+ health = client.health()
88
+ except WayyClientError as e:
89
+ _handle_error(e)
90
+
91
+ print_info("Service", info.get("service", "?"))
92
+ print_info("Version", info.get("version", "?"))
93
+ print_info("Status", health.get("status", "?"))
94
+ print_info("Tables", health.get("tables", 0))
95
+
96
+
97
+ # --- Tables ---
98
+
99
+
100
+ @app.command()
101
+ def tables() -> None:
102
+ """List all tables in the database."""
103
+ try:
104
+ with WayyClient() as client:
105
+ table_list = client.list_tables()
106
+ except WayyClientError as e:
107
+ _handle_error(e)
108
+
109
+ if not table_list:
110
+ console.print("[dim]No tables[/dim]")
111
+ return
112
+
113
+ for t in table_list:
114
+ console.print(f" {t}")
115
+
116
+
117
+ @app.command()
118
+ def create(
119
+ name: str = typer.Argument(..., help="Table name"),
120
+ schema: str = typer.Option(
121
+ ..., "--schema", "-s",
122
+ help='Column schema as JSON: \'{"ts": "timestamp", "price": "float64"}\'',
123
+ ),
124
+ primary_key: Optional[str] = typer.Option(None, "--pk", help="Primary key column"),
125
+ sorted_by: Optional[str] = typer.Option(None, "--sorted-by", help="Sorted index column"),
126
+ ) -> None:
127
+ """Create a new table with a typed schema."""
128
+ try:
129
+ schema_dict = json.loads(schema)
130
+ except json.JSONDecodeError as e:
131
+ print_error(f"Invalid JSON schema: {e}")
132
+ raise typer.Exit(1)
133
+
134
+ columns = [{"name": k, "dtype": v} for k, v in schema_dict.items()]
135
+
136
+ try:
137
+ with WayyClient() as client:
138
+ result = client.create_table(name, columns, primary_key=primary_key, sorted_by=sorted_by)
139
+ except WayyClientError as e:
140
+ _handle_error(e)
141
+
142
+ print_success(f"Created table '{name}' with columns: {result.get('columns', [])}")
143
+
144
+
145
+ @app.command()
146
+ def drop(name: str = typer.Argument(..., help="Table name to delete")) -> None:
147
+ """Drop a table."""
148
+ try:
149
+ with WayyClient() as client:
150
+ client.drop_table(name)
151
+ except WayyClientError as e:
152
+ _handle_error(e)
153
+
154
+ print_success(f"Dropped table '{name}'")
155
+
156
+
157
+ @app.command()
158
+ def info(name: str = typer.Argument(..., help="Table name")) -> None:
159
+ """Get table metadata."""
160
+ try:
161
+ with WayyClient() as client:
162
+ data = client.get_table_info(name)
163
+ except WayyClientError as e:
164
+ _handle_error(e)
165
+
166
+ print_info("Name", data.get("name"))
167
+ print_info("Rows", data.get("num_rows"))
168
+ print_info("Columns", data.get("num_columns"))
169
+ print_info("Column names", ", ".join(data.get("columns", [])))
170
+ print_info("Sorted by", data.get("sorted_by") or "none")
171
+
172
+
173
+ @app.command()
174
+ def query(
175
+ table: str = typer.Argument(..., help="Table name"),
176
+ limit: int = typer.Option(100, "--limit", "-n", help="Max rows to return"),
177
+ offset: int = typer.Option(0, "--offset", help="Row offset"),
178
+ where: Optional[list[str]] = typer.Option(None, "--where", "-w", help="Filter as col=val"),
179
+ output_json: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
180
+ ) -> None:
181
+ """Query table data."""
182
+ try:
183
+ with WayyClient() as client:
184
+ if where:
185
+ filters = {}
186
+ for w in where:
187
+ if "=" not in w:
188
+ print_error(f"Invalid filter: {w} (expected col=val)")
189
+ raise typer.Exit(1)
190
+ k, v = w.split("=", 1)
191
+ filters[k] = v
192
+
193
+ result = client.filter_rows(table, filters=filters, limit=limit)
194
+
195
+ if output_json:
196
+ print_json_data(result)
197
+ else:
198
+ print_rows(result.get("data", []), title=f"{table} ({result.get('count', 0)} rows)")
199
+ else:
200
+ result = client.get_table_data(table, limit=limit, offset=offset)
201
+
202
+ if output_json:
203
+ print_json_data(result)
204
+ else:
205
+ data = result.get("data", {})
206
+ total = result.get("total_rows", 0)
207
+ shown = len(next(iter(data.values()))) if data else 0
208
+ print_table_data(data, title=f"{table} ({shown}/{total} rows)")
209
+
210
+ except WayyClientError as e:
211
+ _handle_error(e)
212
+
213
+
214
+ @app.command()
215
+ def upload(
216
+ name: str = typer.Argument(..., help="Table name"),
217
+ file: Path = typer.Option(..., "--file", "-f", help="CSV file to upload"),
218
+ sorted_by: Optional[str] = typer.Option(None, "--sorted-by", help="Sorted index column"),
219
+ ) -> None:
220
+ """Upload a CSV file as a new table."""
221
+ if not file.exists():
222
+ print_error(f"File not found: {file}")
223
+ raise typer.Exit(1)
224
+
225
+ try:
226
+ with WayyClient() as client:
227
+ result = upload_csv(client, name, file, sorted_by=sorted_by)
228
+ except WayyClientError as e:
229
+ _handle_error(e)
230
+ except ValueError as e:
231
+ print_error(str(e))
232
+ raise typer.Exit(1)
233
+
234
+ print_success(f"Uploaded '{name}': {result.get('rows', 0)} rows, columns: {result.get('columns', [])}")
235
+
236
+
237
+ # --- Aggregations ---
238
+
239
+
240
+ @app.command()
241
+ def agg(
242
+ table: str = typer.Argument(..., help="Table name"),
243
+ column: str = typer.Argument(..., help="Column name"),
244
+ op: str = typer.Argument(..., help="Operation: sum, avg, min, max, std"),
245
+ ) -> None:
246
+ """Run an aggregation on a table column."""
247
+ try:
248
+ with WayyClient() as client:
249
+ result = client.aggregate(table, column, op)
250
+ except WayyClientError as e:
251
+ _handle_error(e)
252
+
253
+ console.print(f"[bold]{op}[/bold]({table}.{column}) = [cyan]{result.get('result')}[/cyan]")
254
+
255
+
256
+ # --- Streaming ---
257
+
258
+
259
+ @app.command()
260
+ def stream(
261
+ table: str = typer.Argument(..., help="Table name to subscribe to"),
262
+ symbols: Optional[str] = typer.Option(None, "--symbols", "-s", help="Comma-separated symbol filter"),
263
+ output_json: bool = typer.Option(False, "--json", "-j", help="Output raw JSON"),
264
+ ) -> None:
265
+ """Subscribe to real-time streaming updates via WebSocket."""
266
+ import asyncio
267
+
268
+ async def _stream() -> None:
269
+ import websockets
270
+
271
+ url = get_server_url().replace("http://", "ws://").replace("https://", "wss://")
272
+ ws_url = f"{url}/ws/subscribe/{table}"
273
+
274
+ console.print(f"[dim]Connecting to {ws_url}...[/dim]")
275
+
276
+ async with websockets.connect(ws_url) as ws:
277
+ if symbols:
278
+ symbol_list = [s.strip() for s in symbols.split(",")]
279
+ await ws.send(json.dumps({"symbols": symbol_list}))
280
+ console.print(f"[dim]Filtering: {symbol_list}[/dim]")
281
+
282
+ console.print("[green]Connected.[/green] Press Ctrl+C to disconnect.\n")
283
+
284
+ try:
285
+ async for message in ws:
286
+ data = json.loads(message)
287
+ if output_json:
288
+ print_json_data(data)
289
+ else:
290
+ if "batch" in data:
291
+ for tick in data["batch"]:
292
+ _print_tick(tick)
293
+ else:
294
+ _print_tick(data)
295
+ except asyncio.CancelledError:
296
+ pass
297
+
298
+ try:
299
+ asyncio.run(_stream())
300
+ except KeyboardInterrupt:
301
+ console.print("\n[dim]Disconnected.[/dim]")
302
+
303
+
304
+ def _print_tick(tick: dict[str, Any]) -> None:
305
+ """Format a single tick for display."""
306
+ sym = tick.get("symbol", "?")
307
+ price = tick.get("price", "?")
308
+ vol = tick.get("volume", "")
309
+ bid = tick.get("bid", "")
310
+ ask = tick.get("ask", "")
311
+
312
+ parts = [f"[bold]{sym}[/bold]", f"[cyan]{price}[/cyan]"]
313
+ if bid and ask:
314
+ parts.append(f"[dim]{bid}/{ask}[/dim]")
315
+ if vol:
316
+ parts.append(f"vol={vol}")
317
+
318
+ console.print(" ".join(parts))
319
+
320
+
321
+ # --- Ingestion ---
322
+
323
+
324
+ @app.command()
325
+ def ingest(
326
+ table: str = typer.Argument(..., help="Table name"),
327
+ file: Path = typer.Option(..., "--file", "-f", help="JSON file with ticks"),
328
+ ) -> None:
329
+ """Batch ingest ticks from a JSON file."""
330
+ if not file.exists():
331
+ print_error(f"File not found: {file}")
332
+ raise typer.Exit(1)
333
+
334
+ try:
335
+ with WayyClient() as client:
336
+ result = upload_json_ticks(client, table, file)
337
+ except WayyClientError as e:
338
+ _handle_error(e)
339
+ except ValueError as e:
340
+ print_error(str(e))
341
+ raise typer.Exit(1)
342
+
343
+ print_success(f"Ingested {result.get('ingested', 0)} ticks into '{table}'")
344
+
345
+
346
+ # --- KV Store ---
347
+
348
+ kv_app = typer.Typer(name="kv", help="Key-value store operations", no_args_is_help=True)
349
+ app.add_typer(kv_app)
350
+
351
+
352
+ @kv_app.command("get")
353
+ def kv_get(key: str = typer.Argument(..., help="Key to get")) -> None:
354
+ """Get a value by key."""
355
+ try:
356
+ with WayyClient() as client:
357
+ value = client.kv_get(key)
358
+ except WayyClientError as e:
359
+ _handle_error(e)
360
+
361
+ print_kv(key, value)
362
+
363
+
364
+ @kv_app.command("set")
365
+ def kv_set(
366
+ key: str = typer.Argument(..., help="Key to set"),
367
+ value: str = typer.Argument(..., help="Value (JSON or string)"),
368
+ ttl: Optional[float] = typer.Option(None, "--ttl", help="TTL in seconds"),
369
+ ) -> None:
370
+ """Set a key-value pair."""
371
+ try:
372
+ parsed = json.loads(value)
373
+ except json.JSONDecodeError:
374
+ parsed = value
375
+
376
+ try:
377
+ with WayyClient() as client:
378
+ client.kv_set(key, parsed, ttl=ttl)
379
+ except WayyClientError as e:
380
+ _handle_error(e)
381
+
382
+ print_success(f"Set '{key}'")
383
+
384
+
385
+ @kv_app.command("del")
386
+ def kv_del(key: str = typer.Argument(..., help="Key to delete")) -> None:
387
+ """Delete a key."""
388
+ try:
389
+ with WayyClient() as client:
390
+ client.kv_delete(key)
391
+ except WayyClientError as e:
392
+ _handle_error(e)
393
+
394
+ print_success(f"Deleted '{key}'")
395
+
396
+
397
+ @kv_app.command("list")
398
+ def kv_list(pattern: Optional[str] = typer.Argument(None, help="Glob pattern filter")) -> None:
399
+ """List keys, optionally filtered by pattern."""
400
+ try:
401
+ with WayyClient() as client:
402
+ keys = client.kv_list(pattern)
403
+ except WayyClientError as e:
404
+ _handle_error(e)
405
+
406
+ if not keys:
407
+ console.print("[dim]No keys[/dim]")
408
+ return
409
+
410
+ for k in keys:
411
+ console.print(f" {k}")
412
+
413
+
414
+ # --- Joins ---
415
+
416
+ join_app = typer.Typer(name="join", help="Join operations", no_args_is_help=True)
417
+ app.add_typer(join_app)
418
+
419
+
420
+ @join_app.command("aj")
421
+ def join_aj(
422
+ left: str = typer.Argument(..., help="Left table"),
423
+ right: str = typer.Argument(..., help="Right table"),
424
+ on: str = typer.Option(..., "--on", help="Join keys (comma-separated)"),
425
+ as_of: str = typer.Option(..., "--as-of", help="Temporal column"),
426
+ output_json: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
427
+ ) -> None:
428
+ """As-of join: find most recent right row for each left row."""
429
+ on_cols = [c.strip() for c in on.split(",")]
430
+
431
+ try:
432
+ with WayyClient() as client:
433
+ result = client.as_of_join(left, right, on_cols, as_of)
434
+ except WayyClientError as e:
435
+ _handle_error(e)
436
+
437
+ if output_json:
438
+ print_json_data(result)
439
+ else:
440
+ print_table_data(result.get("data", {}), title=f"aj({left}, {right}) — {result.get('rows', 0)} rows")
441
+
442
+
443
+ @join_app.command("wj")
444
+ def join_wj(
445
+ left: str = typer.Argument(..., help="Left table"),
446
+ right: str = typer.Argument(..., help="Right table"),
447
+ on: str = typer.Option(..., "--on", help="Join keys (comma-separated)"),
448
+ as_of: str = typer.Option(..., "--as-of", help="Temporal column"),
449
+ before: int = typer.Option(..., "--before", help="Window before (nanoseconds)"),
450
+ after: int = typer.Option(..., "--after", help="Window after (nanoseconds)"),
451
+ output_json: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
452
+ ) -> None:
453
+ """Window join: find all right rows within time window."""
454
+ on_cols = [c.strip() for c in on.split(",")]
455
+
456
+ try:
457
+ with WayyClient() as client:
458
+ result = client.window_join(left, right, on_cols, as_of, before, after)
459
+ except WayyClientError as e:
460
+ _handle_error(e)
461
+
462
+ if output_json:
463
+ print_json_data(result)
464
+ else:
465
+ print_table_data(result.get("data", {}), title=f"wj({left}, {right}) — {result.get('rows', 0)} rows")
466
+
467
+
468
+ # --- Window Functions ---
469
+
470
+
471
+ @app.command("window")
472
+ def window_fn(
473
+ table: str = typer.Argument(..., help="Table name"),
474
+ column: str = typer.Argument(..., help="Column name"),
475
+ op: str = typer.Argument(..., help="Operation: mavg, msum, mstd, mmin, mmax, ema, diff, pct_change"),
476
+ window: Optional[int] = typer.Option(None, "--window", "-w", help="Window size"),
477
+ alpha: Optional[float] = typer.Option(None, "--alpha", help="EMA alpha"),
478
+ output_json: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
479
+ ) -> None:
480
+ """Apply a window function to a column."""
481
+ try:
482
+ with WayyClient() as client:
483
+ result = client.window_function(table, column, op, window=window, alpha=alpha)
484
+ except WayyClientError as e:
485
+ _handle_error(e)
486
+
487
+ if output_json:
488
+ print_json_data(result)
489
+ else:
490
+ values = result.get("result", [])
491
+ console.print(f"[bold]{op}[/bold]({table}.{column}) — {len(values)} values")
492
+ if len(values) <= 20:
493
+ for v in values:
494
+ console.print(f" {v}")
495
+ else:
496
+ for v in values[:5]:
497
+ console.print(f" {v}")
498
+ console.print(f" ... ({len(values) - 10} more)")
499
+ for v in values[-5:]:
500
+ console.print(f" {v}")
501
+
502
+
503
+ # --- Checkpoint ---
504
+
505
+
506
+ @app.command()
507
+ def checkpoint() -> None:
508
+ """Flush WAL and save all tables to disk."""
509
+ try:
510
+ with WayyClient() as client:
511
+ client.checkpoint()
512
+ except WayyClientError as e:
513
+ _handle_error(e)
514
+
515
+ print_success("Checkpoint complete")
516
+
517
+
518
+ app.add_typer(deploy_app)
519
+
520
+
521
+ if __name__ == "__main__":
522
+ app()
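
For the ingest path above, the JSON file may be either a bare list of ticks or {"ticks": [...]}. A sketch of producing one; the field names mirror what _print_tick displays, but the exact tick schema depends on the table you created, so treat them as placeholders.

import json

ticks = [
    {"symbol": "AAPL", "price": 187.5, "volume": 100, "bid": 187.4, "ask": 187.6},
    {"symbol": "MSFT", "price": 402.1, "volume": 50, "bid": 402.0, "ask": 402.2},
]

with open("ticks.json", "w") as f:
    json.dump(ticks, f)

# Then: wayy ingest trades --file ticks.json
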
python/wayy_db/cli/output.py ADDED
@@ -0,0 +1,76 @@
1
+ """Output formatting for the WayyDB CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sys
7
+ from typing import Any
8
+
9
+ from rich.console import Console
10
+ from rich.json import JSON
11
+ from rich.table import Table
12
+
13
+ console = Console()
14
+ err_console = Console(stderr=True)
15
+
16
+
17
+ def print_json_data(data: Any) -> None:
18
+ """Pretty-print JSON data."""
19
+ console.print(JSON(json.dumps(data, default=str)))
20
+
21
+
22
+ def print_table_data(data: dict[str, list[Any]], title: str = "") -> None:
23
+ """Render columnar data as a rich table."""
24
+ if not data:
25
+ console.print("[dim]No data[/dim]")
26
+ return
27
+
28
+ table = Table(title=title, show_lines=False)
29
+ columns = list(data.keys())
30
+ for col in columns:
31
+ table.add_column(col, style="cyan")
32
+
33
+ num_rows = len(next(iter(data.values())))
34
+ for i in range(num_rows):
35
+ row = [str(data[col][i]) for col in columns]
36
+ table.add_row(*row)
37
+
38
+ console.print(table)
39
+
40
+
41
+ def print_rows(rows: list[dict[str, Any]], title: str = "") -> None:
42
+ """Render a list of row dicts as a rich table."""
43
+ if not rows:
44
+ console.print("[dim]No rows[/dim]")
45
+ return
46
+
47
+ columns = list(rows[0].keys())
48
+ table = Table(title=title, show_lines=False)
49
+ for col in columns:
50
+ table.add_column(col, style="cyan")
51
+
52
+ for row in rows:
53
+ table.add_row(*[str(row.get(col, "")) for col in columns])
54
+
55
+ console.print(table)
56
+
57
+
58
+ def print_kv(key: str, value: Any) -> None:
59
+ """Print a KV pair."""
60
+ console.print(f"[bold]{key}[/bold] = ", end="")
61
+ if isinstance(value, (dict, list)):
62
+ print_json_data(value)
63
+ else:
64
+ console.print(str(value))
65
+
66
+
67
+ def print_success(msg: str) -> None:
68
+ console.print(f"[green]{msg}[/green]")
69
+
70
+
71
+ def print_error(msg: str) -> None:
72
+ err_console.print(f"[red]{msg}[/red]")
73
+
74
+
75
+ def print_info(label: str, value: Any) -> None:
76
+ console.print(f"[bold]{label}:[/bold] {value}")
python/wayy_db/ops.py ADDED
@@ -0,0 +1,55 @@
1
+ """
2
+ WayyDB Operations
3
+
4
+ High-performance operations for time-series analysis:
5
+ - Temporal joins (aj, wj)
6
+ - SIMD aggregations (sum, avg, min, max, std)
7
+ - Window functions (mavg, msum, mstd, ema, etc.)
8
+ """
9
+
10
+ from wayy_db._core import ops as _ops
11
+
12
+ # Re-export all operations from C++ module
13
+ from wayy_db._core.ops import (
14
+ # Aggregations
15
+ sum,
16
+ avg,
17
+ min,
18
+ max,
19
+ std,
20
+ # Joins
21
+ aj,
22
+ wj,
23
+ # Window functions
24
+ mavg,
25
+ msum,
26
+ mstd,
27
+ mmin,
28
+ mmax,
29
+ ema,
30
+ diff,
31
+ pct_change,
32
+ shift,
33
+ )
34
+
35
+ __all__ = [
36
+ # Aggregations
37
+ "sum",
38
+ "avg",
39
+ "min",
40
+ "max",
41
+ "std",
42
+ # Joins
43
+ "aj",
44
+ "wj",
45
+ # Window functions
46
+ "mavg",
47
+ "msum",
48
+ "mstd",
49
+ "mmin",
50
+ "mmax",
51
+ "ema",
52
+ "diff",
53
+ "pct_change",
54
+ "shift",
55
+ ]
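
The re-exported functions come from the C++ extension, so their exact argument types live in bindings.cpp (not shown here). As a semantic reference only, here is a plain-Python sketch of one common moving-average convention (averaging over however many points are available until the window fills); it is illustrative, not the binding itself.

def mavg_reference(values: list[float], window: int) -> list[float]:
    """Rolling mean; early rows average over the points seen so far (assumed convention)."""
    out = []
    for i in range(len(values)):
        lo = max(0, i - window + 1)
        chunk = values[lo : i + 1]
        out.append(sum(chunk) / len(chunk))
    return out

print(mavg_reference([1.0, 2.0, 3.0, 4.0], window=2))  # [1.0, 1.5, 2.5, 3.5]
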
src/column.cpp ADDED
@@ -0,0 +1,121 @@
1
+ #include "wayy_db/column.hpp"
2
+
3
+ #include <bit>      // std::popcount; assumed not pulled in transitively
+ #include <cstring>
4
+
5
+ namespace wayy_db {
6
+
7
+ Column::Column(std::string name, DType dtype, std::vector<uint8_t> data)
8
+ : name_(std::move(name))
9
+ , dtype_(dtype)
10
+ , size_(dtype_size(dtype) > 0 ? data.size() / dtype_size(dtype) : 0)
11
+ , owns_data_(true)
12
+ , owned_data_(std::move(data)) {
13
+ data_ = owned_data_.data();
14
+ }
15
+
16
+ Column::Column(std::string name, DType dtype, void* data, size_t size, bool owns_data)
17
+ : name_(std::move(name))
18
+ , dtype_(dtype)
19
+ , data_(data)
20
+ , size_(size)
21
+ , owns_data_(owns_data) {
22
+ if (owns_data && data != nullptr && dtype_size(dtype) > 0) {
23
+ // Copy data into owned buffer
24
+ size_t byte_size = size * dtype_size(dtype);
25
+ owned_data_.resize(byte_size);
26
+ std::memcpy(owned_data_.data(), data, byte_size);
27
+ data_ = owned_data_.data();
28
+ }
29
+ }
30
+
31
+ // --- Validity bitmap ---
32
+
33
+ void Column::ensure_validity() {
34
+ if (has_validity_) return;
35
+ size_t num_bytes = (size_ + 7) / 8;
36
+ validity_.assign(num_bytes, 0xFF); // All bits set = all valid
37
+ // Handle trailing bits in last byte
38
+ if (size_ % 8 != 0) {
39
+ uint8_t mask = static_cast<uint8_t>((1u << (size_ % 8)) - 1);
40
+ validity_.back() = mask;
41
+ }
42
+ has_validity_ = true;
43
+ }
44
+
45
+ bool Column::is_valid(size_t row) const {
46
+ if (!has_validity_) return true; // No bitmap = all valid
47
+ if (row >= size_) return false;
48
+ return (validity_[row / 8] >> (row % 8)) & 1;
49
+ }
50
+
51
+ void Column::set_valid(size_t row, bool valid) {
52
+ if (!has_validity_) ensure_validity();
53
+ if (row >= size_) return;
54
+ if (valid) {
55
+ validity_[row / 8] |= (1u << (row % 8));
56
+ } else {
57
+ validity_[row / 8] &= ~(1u << (row % 8));
58
+ }
59
+ }
60
+
61
+ size_t Column::count_valid() const {
62
+ if (!has_validity_) return size_; // All valid
63
+ size_t count = 0;
64
+ for (size_t i = 0; i < validity_.size(); ++i) {
65
+ count += std::popcount(validity_[i]);
66
+ }
67
+ return count;
68
+ }
69
+
70
+ void Column::set_validity_bitmap(std::vector<uint8_t> bitmap) {
71
+ validity_ = std::move(bitmap);
72
+ has_validity_ = !validity_.empty();
73
+ }
74
+
75
+ void Column::append(const void* value, size_t value_size) {
76
+ if (!owns_data_) {
77
+ throw InvalidOperation("Cannot append to non-owned column");
78
+ }
79
+ size_t elem_size = dtype_size(dtype_);
80
+ if (elem_size == 0) {
81
+ throw InvalidOperation("Cannot append to variable-length column via Column::append");
82
+ }
83
+ if (value_size != elem_size) {
84
+ throw InvalidOperation("Value size mismatch in append");
85
+ }
86
+
87
+ size_t old_byte_size = owned_data_.size();
88
+ owned_data_.resize(old_byte_size + elem_size);
89
+ std::memcpy(owned_data_.data() + old_byte_size, value, elem_size);
90
+ data_ = owned_data_.data();
91
+ ++size_;
92
+
93
+ // Extend validity bitmap if present
94
+ if (has_validity_) {
95
+ size_t needed_bytes = (size_ + 7) / 8;
96
+ if (validity_.size() < needed_bytes) {
97
+ validity_.push_back(0);
98
+ }
99
+ set_valid(size_ - 1, true);
100
+ }
101
+ }
102
+
103
+ void Column::set(size_t row, const void* value, size_t value_size) {
104
+ if (!owns_data_) {
105
+ throw InvalidOperation("Cannot set on non-owned column");
106
+ }
107
+ if (row >= size_) {
108
+ throw InvalidOperation("Row index out of range in set");
109
+ }
110
+ size_t elem_size = dtype_size(dtype_);
111
+ if (elem_size == 0) {
112
+ throw InvalidOperation("Cannot set on variable-length column via Column::set");
113
+ }
114
+ if (value_size != elem_size) {
115
+ throw InvalidOperation("Value size mismatch in set");
116
+ }
117
+
118
+ std::memcpy(owned_data_.data() + row * elem_size, value, elem_size);
119
+ }
120
+
121
+ } // namespace wayy_db
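
The validity bitmap above packs one bit per row, least-significant bit first within each byte. A tiny Python sketch of the same bit arithmetic used by is_valid/set_valid, for intuition:

def is_valid(bitmap: bytearray, row: int) -> bool:
    return bool((bitmap[row // 8] >> (row % 8)) & 1)

def set_valid(bitmap: bytearray, row: int, valid: bool) -> None:
    if valid:
        bitmap[row // 8] |= 1 << (row % 8)
    else:
        bitmap[row // 8] &= ~(1 << (row % 8)) & 0xFF

bm = bytearray([0xFF, 0x07])      # 11 rows, all valid; 3 trailing bits masked off as in ensure_validity()
set_valid(bm, 10, False)
print(is_valid(bm, 9), is_valid(bm, 10))  # True False
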
src/database.cpp ADDED
@@ -0,0 +1,156 @@
1
+ #include "wayy_db/database.hpp"
2
+
3
+ #include <filesystem>
4
+ #include <mutex>
+ #include <shared_mutex>  // std::shared_lock; assumed not pulled in via database.hpp
5
+
6
+ namespace fs = std::filesystem;
7
+
8
+ namespace wayy_db {
9
+
10
+ Database::Database() = default;
11
+
12
+ Database::Database(const std::string& path) : path_(path) {
13
+ if (!path_.empty()) {
14
+ fs::create_directories(path_);
15
+ scan_tables();
16
+
17
+ // Initialize WAL
18
+ wal_ = std::make_unique<WriteAheadLog>(path_);
19
+
20
+ // Replay any unprocessed WAL entries from a crash
21
+ if (wal_->has_entries()) {
22
+ wal_->replay(*this);
23
+ }
24
+ }
25
+ }
26
+
27
+ std::vector<std::string> Database::tables() const {
28
+ std::shared_lock lock(mutex_);
29
+ std::vector<std::string> names;
30
+ names.reserve(tables_.size());
31
+ for (const auto& [name, _] : tables_) {
32
+ names.push_back(name);
33
+ }
34
+ // Also include tables on disk that aren't loaded yet
35
+ for (const auto& [name, _] : loaded_) {
36
+ if (!tables_.count(name)) {
37
+ names.push_back(name);
38
+ }
39
+ }
40
+ return names;
41
+ }
42
+
43
+ bool Database::has_table(const std::string& name) const {
44
+ std::shared_lock lock(mutex_);
45
+ return tables_.count(name) > 0 || loaded_.count(name) > 0;
46
+ }
47
+
48
+ Table& Database::table(const std::string& name) {
49
+ // First try with shared lock (read-only)
50
+ {
51
+ std::shared_lock lock(mutex_);
52
+ auto it = tables_.find(name);
53
+ if (it != tables_.end()) {
54
+ return it->second;
55
+ }
56
+ }
57
+
58
+ // Need to lazy load - acquire exclusive lock
59
+ std::unique_lock lock(mutex_);
60
+
61
+ // Double-check after acquiring exclusive lock (another thread may have loaded it)
62
+ auto it = tables_.find(name);
63
+ if (it != tables_.end()) {
64
+ return it->second;
65
+ }
66
+
67
+ // Try to load from disk
68
+ if (is_persistent() && loaded_.count(name)) {
69
+ tables_.emplace(name, Table::mmap(table_path(name)));
70
+ return tables_.at(name);
71
+ }
72
+
73
+ throw WayyException("Table not found: " + name);
74
+ }
75
+
76
+ Table& Database::create_table(const std::string& name) {
77
+ std::unique_lock lock(mutex_);
78
+
79
+ if (tables_.count(name) > 0 || loaded_.count(name) > 0) {
80
+ throw InvalidOperation("Table already exists: " + name);
81
+ }
82
+
83
+ tables_.emplace(name, Table(name));
84
+ if (is_persistent()) {
85
+ loaded_[name] = true;
86
+ }
87
+ return tables_.at(name);
88
+ }
89
+
90
+ void Database::add_table(Table table) {
91
+ const std::string& name = table.name();
92
+
93
+ std::unique_lock lock(mutex_);
94
+
95
+ if (tables_.count(name) > 0 || loaded_.count(name) > 0) {
96
+ throw InvalidOperation("Table already exists: " + name);
97
+ }
98
+
99
+ if (is_persistent()) {
100
+ table.save(table_path(name));
101
+ loaded_[name] = true;
102
+ }
103
+ tables_.emplace(name, std::move(table));
104
+ }
105
+
106
+ void Database::drop_table(const std::string& name) {
107
+ std::unique_lock lock(mutex_);
108
+
109
+ tables_.erase(name);
110
+ loaded_.erase(name);
111
+
112
+ if (is_persistent()) {
113
+ fs::remove_all(table_path(name));
114
+ }
115
+ }
116
+
117
+ void Database::save() {
118
+ if (!is_persistent()) return;
119
+
120
+ std::shared_lock lock(mutex_);
121
+ for (auto& [name, table] : tables_) {
122
+ table.save(table_path(name));
123
+ }
124
+ }
125
+
126
+ void Database::refresh() {
127
+ if (!is_persistent()) return;
128
+
129
+ std::unique_lock lock(mutex_);
130
+ scan_tables();
131
+ }
132
+
133
+ void Database::checkpoint() {
134
+ if (!wal_) return;
135
+ wal_->checkpoint(*this);
136
+ }
137
+
138
+ std::string Database::table_path(const std::string& name) const {
139
+ return path_ + "/" + name;
140
+ }
141
+
142
+ void Database::scan_tables() {
143
+ if (!fs::exists(path_)) return;
144
+
145
+ for (const auto& entry : fs::directory_iterator(path_)) {
146
+ if (entry.is_directory()) {
147
+ std::string meta_path = entry.path().string() + "/_meta.json";
148
+ if (fs::exists(meta_path)) {
149
+ std::string name = entry.path().filename().string();
150
+ loaded_[name] = false; // Not loaded into memory yet
151
+ }
152
+ }
153
+ }
154
+ }
155
+
156
+ } // namespace wayy_db
src/hash_index.cpp ADDED
@@ -0,0 +1,62 @@
1
+ #include "wayy_db/hash_index.hpp"
2
+ #include "wayy_db/table.hpp"
3
+ #include "wayy_db/column.hpp"
4
+ #include "wayy_db/string_column.hpp"
5
+
6
+ namespace wayy_db {
7
+
8
+ void HashIndex::build_int(const Table& table, const std::string& col_name) {
9
+ clear();
10
+ const Column& col = table.column(col_name);
11
+ auto view = col.as<const int64_t>();
12
+ for (size_t i = 0; i < view.size(); ++i) {
13
+ if (col.is_valid(i)) {
14
+ int_map_[view[i]] = i;
15
+ }
16
+ }
17
+ }
18
+
19
+ void HashIndex::build_str(const Table& table, const std::string& col_name) {
20
+ clear();
21
+ const StringColumn& col = table.string_column(col_name);
22
+ for (size_t i = 0; i < col.size(); ++i) {
23
+ if (col.is_valid(i)) {
24
+ str_map_[std::string(col.get(i))] = i;
25
+ }
26
+ }
27
+ }
28
+
29
+ std::optional<size_t> HashIndex::find_int(int64_t key) const {
30
+ auto it = int_map_.find(key);
31
+ if (it != int_map_.end()) return it->second;
32
+ return std::nullopt;
33
+ }
34
+
35
+ std::optional<size_t> HashIndex::find_str(std::string_view key) const {
36
+ auto it = str_map_.find(std::string(key));
37
+ if (it != str_map_.end()) return it->second;
38
+ return std::nullopt;
39
+ }
40
+
41
+ void HashIndex::insert_int(int64_t key, size_t row) {
42
+ int_map_[key] = row;
43
+ }
44
+
45
+ void HashIndex::insert_str(std::string_view key, size_t row) {
46
+ str_map_[std::string(key)] = row;
47
+ }
48
+
49
+ void HashIndex::remove_int(int64_t key) {
50
+ int_map_.erase(key);
51
+ }
52
+
53
+ void HashIndex::remove_str(std::string_view key) {
54
+ str_map_.erase(std::string(key));
55
+ }
56
+
57
+ void HashIndex::clear() {
58
+ int_map_.clear();
59
+ str_map_.clear();
60
+ }
61
+
62
+ } // namespace wayy_db
src/mmap_file.cpp ADDED
@@ -0,0 +1,154 @@
1
+ #include "wayy_db/mmap_file.hpp"
2
+ #include "wayy_db/types.hpp"
3
+
4
+ #include <fcntl.h>
5
+ #include <sys/mman.h>
6
+ #include <sys/stat.h>
7
+ #include <unistd.h>
8
+
9
+ #include <cerrno>   // errno; assumed not pulled in transitively
+ #include <cstring>
10
+
11
+ namespace wayy_db {
12
+
13
+ MmapFile::MmapFile(const std::string& path, Mode mode, size_t size) {
14
+ open(path, mode, size);
15
+ }
16
+
17
+ MmapFile::MmapFile(MmapFile&& other) noexcept
18
+ : path_(std::move(other.path_))
19
+ , data_(other.data_)
20
+ , size_(other.size_)
21
+ , mode_(other.mode_)
22
+ , fd_(other.fd_) {
23
+ other.data_ = nullptr;
24
+ other.size_ = 0;
25
+ other.fd_ = -1;
26
+ }
27
+
28
+ MmapFile& MmapFile::operator=(MmapFile&& other) noexcept {
29
+ if (this != &other) {
30
+ close();
31
+ path_ = std::move(other.path_);
32
+ data_ = other.data_;
33
+ size_ = other.size_;
34
+ mode_ = other.mode_;
35
+ fd_ = other.fd_;
36
+ other.data_ = nullptr;
37
+ other.size_ = 0;
38
+ other.fd_ = -1;
39
+ }
40
+ return *this;
41
+ }
42
+
43
+ MmapFile::~MmapFile() {
44
+ close();
45
+ }
46
+
47
+ void MmapFile::open(const std::string& path, Mode mode, size_t size) {
48
+ close();
49
+
50
+ path_ = path;
51
+ mode_ = mode;
52
+
53
+ int flags = 0;
54
+ int prot = 0;
55
+
56
+ switch (mode) {
57
+ case Mode::ReadOnly:
58
+ flags = O_RDONLY;
59
+ prot = PROT_READ;
60
+ break;
61
+ case Mode::ReadWrite:
62
+ flags = O_RDWR;
63
+ prot = PROT_READ | PROT_WRITE;
64
+ break;
65
+ case Mode::Create:
66
+ flags = O_RDWR | O_CREAT | O_TRUNC;
67
+ prot = PROT_READ | PROT_WRITE;
68
+ break;
69
+ }
70
+
71
+ fd_ = ::open(path.c_str(), flags, 0644);
72
+ if (fd_ < 0) {
73
+ throw WayyException("Failed to open file: " + path + " (" + strerror(errno) + ")");
74
+ }
75
+
76
+ if (mode == Mode::Create && size > 0) {
77
+ // Extend file to requested size
78
+ if (ftruncate(fd_, size) < 0) {
79
+ ::close(fd_);
80
+ fd_ = -1;
81
+ throw WayyException("Failed to resize file: " + path);
82
+ }
83
+ size_ = size;
84
+ } else {
85
+ // Get file size
86
+ struct stat st;
87
+ if (fstat(fd_, &st) < 0) {
88
+ ::close(fd_);
89
+ fd_ = -1;
90
+ throw WayyException("Failed to stat file: " + path);
91
+ }
92
+ size_ = st.st_size;
93
+ }
94
+
95
+ if (size_ == 0) {
96
+ // Can't mmap empty file
97
+ return;
98
+ }
99
+
100
+ data_ = mmap(nullptr, size_, prot, MAP_SHARED, fd_, 0);
101
+ if (data_ == MAP_FAILED) {
102
+ data_ = nullptr;
103
+ ::close(fd_);
104
+ fd_ = -1;
105
+ throw WayyException("Failed to mmap file: " + path + " (" + strerror(errno) + ")");
106
+ }
107
+ }
108
+
109
+ void MmapFile::close() {
110
+ if (data_ != nullptr) {
111
+ munmap(data_, size_);
112
+ data_ = nullptr;
113
+ }
114
+ if (fd_ >= 0) {
115
+ ::close(fd_);
116
+ fd_ = -1;
117
+ }
118
+ size_ = 0;
119
+ path_.clear();
120
+ }
121
+
122
+ void MmapFile::sync() {
123
+ if (data_ != nullptr && mode_ != Mode::ReadOnly) {
124
+ msync(data_, size_, MS_SYNC);
125
+ }
126
+ }
127
+
128
+ void MmapFile::resize(size_t new_size) {
129
+ if (mode_ != Mode::Create && mode_ != Mode::ReadWrite) {
130
+ throw InvalidOperation("Cannot resize read-only mmap");
131
+ }
132
+
133
+ if (data_ != nullptr) {
134
+ munmap(data_, size_);
135
+ data_ = nullptr;
136
+ }
137
+
138
+ if (ftruncate(fd_, new_size) < 0) {
139
+ throw WayyException("Failed to resize file: " + path_);
140
+ }
141
+
142
+ size_ = new_size;
143
+
144
+ if (size_ > 0) {
145
+ int prot = PROT_READ | PROT_WRITE;
146
+ data_ = mmap(nullptr, size_, prot, MAP_SHARED, fd_, 0);
147
+ if (data_ == MAP_FAILED) {
148
+ data_ = nullptr;
149
+ throw WayyException("Failed to remap file: " + path_);
150
+ }
151
+ }
152
+ }
153
+
154
+ } // namespace wayy_db
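
The same create/extend/map/sync pattern can be mimicked with Python's stdlib mmap, which can be handy when poking at files on disk; the file name below is a placeholder and the actual on-disk layout is defined by Table::save, not shown here.

import mmap
import os

path = "example.bin"          # placeholder file, not a real WayyDB column file
size = 4096

fd = os.open(path, os.O_RDWR | os.O_CREAT | os.O_TRUNC, 0o644)
os.ftruncate(fd, size)        # mirrors the ftruncate() call in Mode::Create

with mmap.mmap(fd, size, access=mmap.ACCESS_WRITE) as mm:
    mm[:8] = (12345).to_bytes(8, "little")   # write directly through the mapping
    mm.flush()                               # mirrors msync()

os.close(fd)
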
src/ops/aggregations.cpp ADDED
@@ -0,0 +1,200 @@
1
+ #include "wayy_db/ops/aggregations.hpp"
2
+
3
+ #include <algorithm>
4
+ #include <cmath>
5
+ #include <limits>   // std::numeric_limits; assumed not pulled in transitively
+ #include <numeric>
6
+
7
+ #ifdef WAYY_USE_AVX2
8
+ #include <immintrin.h>
9
+ #endif
10
+
11
+ namespace wayy_db::ops {
12
+
13
+ // Scalar implementations
14
+
15
+ template<typename T>
16
+ T sum(const ColumnView<T>& col) {
17
+ return std::accumulate(col.begin(), col.end(), T{0});
18
+ }
19
+
20
+ template int64_t sum(const ColumnView<int64_t>&);
21
+ template double sum(const ColumnView<double>&);
22
+
23
+ template<typename T>
24
+ T min(const ColumnView<T>& col) {
25
+ if (col.empty()) {
26
+ throw InvalidOperation("min() on empty column");
27
+ }
28
+ return *std::min_element(col.begin(), col.end());
29
+ }
30
+
31
+ template int64_t min(const ColumnView<int64_t>&);
32
+ template double min(const ColumnView<double>&);
33
+
34
+ template<typename T>
35
+ T max(const ColumnView<T>& col) {
36
+ if (col.empty()) {
37
+ throw InvalidOperation("max() on empty column");
38
+ }
39
+ return *std::max_element(col.begin(), col.end());
40
+ }
41
+
42
+ template int64_t max(const ColumnView<int64_t>&);
43
+ template double max(const ColumnView<double>&);
44
+
45
+ template<typename T>
46
+ double variance(const ColumnView<T>& col) {
47
+ if (col.empty()) {
48
+ return std::numeric_limits<double>::quiet_NaN();
49
+ }
50
+
51
+ double mean = avg(col);
52
+ double sum_sq = 0.0;
53
+
54
+ for (const auto& val : col) {
55
+ double diff = static_cast<double>(val) - mean;
56
+ sum_sq += diff * diff;
57
+ }
58
+
59
+ return sum_sq / static_cast<double>(col.size());
60
+ }
61
+
62
+ template double variance(const ColumnView<int64_t>&);
63
+ template double variance(const ColumnView<double>&);
64
+
65
+ template<typename T>
66
+ double std_dev(const ColumnView<T>& col) {
67
+ return std::sqrt(variance(col));
68
+ }
69
+
70
+ template double std_dev(const ColumnView<int64_t>&);
71
+ template double std_dev(const ColumnView<double>&);
72
+
73
+ // SIMD implementations
74
+
75
+ #ifdef WAYY_USE_AVX2
76
+
77
+ double sum_simd(const ColumnView<double>& col) {
78
+ const double* data = col.data();
79
+ size_t n = col.size();
80
+
81
+ __m256d vsum = _mm256_setzero_pd();
82
+
83
+ // Process 4 doubles per iteration
84
+ size_t i = 0;
85
+ for (; i + 4 <= n; i += 4) {
86
+ __m256d v = _mm256_loadu_pd(data + i);
87
+ vsum = _mm256_add_pd(vsum, v);
88
+ }
89
+
90
+ // Horizontal reduction
91
+ __m128d vlow = _mm256_castpd256_pd128(vsum);
92
+ __m128d vhigh = _mm256_extractf128_pd(vsum, 1);
93
+ vlow = _mm_add_pd(vlow, vhigh);
94
+ __m128d high64 = _mm_unpackhi_pd(vlow, vlow);
95
+ double result = _mm_cvtsd_f64(_mm_add_sd(vlow, high64));
96
+
97
+ // Handle remainder
98
+ for (; i < n; ++i) {
99
+ result += data[i];
100
+ }
101
+
102
+ return result;
103
+ }
104
+
105
+ int64_t sum_simd(const ColumnView<int64_t>& col) {
106
+ const int64_t* data = col.data();
107
+ size_t n = col.size();
108
+
109
+ __m256i vsum = _mm256_setzero_si256();
110
+
111
+ // Process 4 int64s per iteration
112
+ size_t i = 0;
113
+ for (; i + 4 <= n; i += 4) {
114
+ __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data + i));
115
+ vsum = _mm256_add_epi64(vsum, v);
116
+ }
117
+
118
+ // Horizontal reduction
119
+ alignas(32) int64_t temp[4];
120
+ _mm256_store_si256(reinterpret_cast<__m256i*>(temp), vsum);
121
+ int64_t result = temp[0] + temp[1] + temp[2] + temp[3];
122
+
123
+ // Handle remainder
124
+ for (; i < n; ++i) {
125
+ result += data[i];
126
+ }
127
+
128
+ return result;
129
+ }
130
+
131
+ #else
132
+
133
+ double sum_simd(const ColumnView<double>& col) {
134
+ return sum(col);
135
+ }
136
+
137
+ int64_t sum_simd(const ColumnView<int64_t>& col) {
138
+ return sum(col);
139
+ }
140
+
141
+ #endif
142
+
143
+ // Type-erased implementations
144
+
145
+ double sum(const Column& col) {
146
+ switch (col.dtype()) {
147
+ case DType::Int64:
148
+ case DType::Timestamp:
149
+ return static_cast<double>(sum_simd(const_cast<Column&>(col).as_int64()));
150
+ case DType::Float64:
151
+ return sum_simd(const_cast<Column&>(col).as_float64());
152
+ default:
153
+ throw InvalidOperation("sum() not supported for this type");
154
+ }
155
+ }
156
+
157
+ double avg(const Column& col) {
158
+ if (col.size() == 0) {
159
+ return std::numeric_limits<double>::quiet_NaN();
160
+ }
161
+ return sum(col) / static_cast<double>(col.size());
162
+ }
163
+
164
+ double min_val(const Column& col) {
165
+ switch (col.dtype()) {
166
+ case DType::Int64:
167
+ case DType::Timestamp:
168
+ return static_cast<double>(min(const_cast<Column&>(col).as_int64()));
169
+ case DType::Float64:
170
+ return min(const_cast<Column&>(col).as_float64());
171
+ default:
172
+ throw InvalidOperation("min() not supported for this type");
173
+ }
174
+ }
175
+
176
+ double max_val(const Column& col) {
177
+ switch (col.dtype()) {
178
+ case DType::Int64:
179
+ case DType::Timestamp:
180
+ return static_cast<double>(max(const_cast<Column&>(col).as_int64()));
181
+ case DType::Float64:
182
+ return max(const_cast<Column&>(col).as_float64());
183
+ default:
184
+ throw InvalidOperation("max() not supported for this type");
185
+ }
186
+ }
187
+
188
+ double std_dev(const Column& col) {
189
+ switch (col.dtype()) {
190
+ case DType::Int64:
191
+ case DType::Timestamp:
192
+ return std_dev(const_cast<Column&>(col).as_int64());
193
+ case DType::Float64:
194
+ return std_dev(const_cast<Column&>(col).as_float64());
195
+ default:
196
+ throw InvalidOperation("std_dev() not supported for this type");
197
+ }
198
+ }
199
+
200
+ } // namespace wayy_db::ops
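
Note that variance()/std_dev() above divide by N, i.e. they compute population statistics. A quick NumPy cross-check (NumPy is assumed to be installed; it is only used here for the comparison):

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
print(x.var(ddof=0), x.std(ddof=0))   # population variance / std, matching the C++ (divide by N)
print(x.var(ddof=1), x.std(ddof=1))   # sample versions, which the C++ does NOT compute
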
src/ops/joins.cpp ADDED
@@ -0,0 +1,271 @@
1
+ #include "wayy_db/ops/joins.hpp"
2
+
3
+ #include <algorithm>
4
+ #include <cstring>
5
+ #include <unordered_map>
6
+ #include <vector>
7
+
8
+ namespace wayy_db::ops {
9
+
10
+ namespace {
11
+
12
+ // Hash combine for multi-key joins
13
+ struct KeyHash {
14
+ size_t operator()(const std::vector<int64_t>& key) const {
15
+ size_t hash = 0;
16
+ for (auto val : key) {
17
+ hash ^= std::hash<int64_t>{}(val) + 0x9e3779b9 + (hash << 6) + (hash >> 2);
18
+ }
19
+ return hash;
20
+ }
21
+ };
22
+
23
+ // Extract join key values from a row
24
+ std::vector<int64_t> extract_key(const Table& table,
25
+ const std::vector<std::string>& on,
26
+ size_t row) {
27
+ std::vector<int64_t> key;
28
+ key.reserve(on.size());
29
+
30
+ for (const auto& col_name : on) {
31
+ const Column& col = table.column(col_name);
32
+ switch (col.dtype()) {
33
+ case DType::Int64:
34
+ case DType::Timestamp:
35
+ key.push_back(const_cast<Column&>(col).as_int64()[row]);
36
+ break;
37
+ case DType::Symbol:
38
+ key.push_back(const_cast<Column&>(col).as_symbol()[row]);
39
+ break;
40
+ default:
41
+ throw InvalidOperation("Join key column must be Int64, Timestamp, or Symbol");
42
+ }
43
+ }
44
+
45
+ return key;
46
+ }
47
+
48
+ // Group row indices by key values
49
+ std::unordered_map<std::vector<int64_t>, std::vector<size_t>, KeyHash>
50
+ group_by_key(const Table& table, const std::vector<std::string>& on) {
51
+ std::unordered_map<std::vector<int64_t>, std::vector<size_t>, KeyHash> groups;
52
+
53
+ for (size_t i = 0; i < table.num_rows(); ++i) {
54
+ auto key = extract_key(table, on, i);
55
+ groups[key].push_back(i);
56
+ }
57
+
58
+ return groups;
59
+ }
60
+
61
+ } // namespace
62
+
63
+ Table aj(const Table& left, const Table& right,
64
+ const std::vector<std::string>& on,
65
+ const std::string& as_of) {
66
+
67
+ // Validate inputs
68
+ if (!left.is_sorted() || left.sorted_by() != as_of) {
69
+ throw InvalidOperation("Left table must be sorted by " + as_of);
70
+ }
71
+ if (!right.is_sorted() || right.sorted_by() != as_of) {
72
+ throw InvalidOperation("Right table must be sorted by " + as_of);
73
+ }
74
+
75
+ // Group right table by join keys
76
+ auto right_groups = group_by_key(right, on);
77
+
78
+ // Get timestamp columns
79
+ auto left_ts = const_cast<Table&>(left).column(as_of).as_int64();
80
+ auto right_ts = const_cast<Table&>(right).column(as_of).as_int64();
81
+
82
+ // Result builders - collect matching indices
83
+ std::vector<size_t> left_indices;
84
+ std::vector<size_t> right_indices; // -1 means no match
85
+ left_indices.reserve(left.num_rows());
86
+ right_indices.reserve(left.num_rows());
87
+
88
+ // For each left row, find the most recent right row
89
+ for (size_t i = 0; i < left.num_rows(); ++i) {
90
+ auto key = extract_key(left, on, i);
91
+ int64_t ts = left_ts[i];
92
+
93
+ auto group_it = right_groups.find(key);
94
+ if (group_it == right_groups.end()) {
95
+ // No matching key in right table
96
+ left_indices.push_back(i);
97
+ right_indices.push_back(static_cast<size_t>(-1));
98
+ continue;
99
+ }
100
+
101
+ const auto& group = group_it->second;
102
+
103
+ // Binary search for largest timestamp <= ts
104
+ auto it = std::upper_bound(group.begin(), group.end(), ts,
105
+ [&right_ts](int64_t t, size_t idx) { return t < right_ts[idx]; });
106
+
107
+ if (it != group.begin()) {
108
+ --it;
109
+ left_indices.push_back(i);
110
+ right_indices.push_back(*it);
111
+ } else {
112
+ // No timestamp <= ts
113
+ left_indices.push_back(i);
114
+ right_indices.push_back(static_cast<size_t>(-1));
115
+ }
116
+ }
117
+
118
+ // Build result table
119
+ Table result("aj_result");
120
+
121
+ // Add left columns
122
+ for (const auto& col_name : left.column_names()) {
123
+ const Column& src = left.column(col_name);
124
+ size_t elem_size = dtype_size(src.dtype());
125
+ std::vector<uint8_t> data(left_indices.size() * elem_size);
126
+
127
+ const uint8_t* src_data = static_cast<const uint8_t*>(src.data());
128
+ for (size_t i = 0; i < left_indices.size(); ++i) {
129
+ std::memcpy(data.data() + i * elem_size,
130
+ src_data + left_indices[i] * elem_size,
131
+ elem_size);
132
+ }
133
+
134
+ result.add_column(Column(col_name, src.dtype(), std::move(data)));
135
+ }
136
+
137
+ // Add right columns (excluding join keys and as_of)
138
+ for (const auto& col_name : right.column_names()) {
139
+ // Skip if already in left or is a join key
140
+ if (result.has_column(col_name)) continue;
141
+ if (std::find(on.begin(), on.end(), col_name) != on.end()) continue;
142
+
143
+ const Column& src = right.column(col_name);
144
+ size_t elem_size = dtype_size(src.dtype());
145
+ std::vector<uint8_t> data(right_indices.size() * elem_size, 0);
146
+
147
+ const uint8_t* src_data = static_cast<const uint8_t*>(src.data());
148
+ for (size_t i = 0; i < right_indices.size(); ++i) {
149
+ if (right_indices[i] != static_cast<size_t>(-1)) {
150
+ std::memcpy(data.data() + i * elem_size,
151
+ src_data + right_indices[i] * elem_size,
152
+ elem_size);
153
+ }
154
+ // else: leave as zero (null representation)
155
+ }
156
+
157
+ result.add_column(Column(col_name, src.dtype(), std::move(data)));
158
+ }
159
+
160
+ result.set_sorted_by(as_of);
161
+ return result;
162
+ }
163
+
164
+ Table wj(const Table& left, const Table& right,
165
+ const std::vector<std::string>& on,
166
+ const std::string& as_of,
167
+ int64_t window_before,
168
+ int64_t window_after) {
169
+
170
+ // Validate inputs
171
+ if (!left.is_sorted() || left.sorted_by() != as_of) {
172
+ throw InvalidOperation("Left table must be sorted by " + as_of);
173
+ }
174
+ if (!right.is_sorted() || right.sorted_by() != as_of) {
175
+ throw InvalidOperation("Right table must be sorted by " + as_of);
176
+ }
177
+
178
+ // Group right table by join keys
179
+ auto right_groups = group_by_key(right, on);
180
+
181
+ // Get timestamp columns
182
+ auto left_ts = const_cast<Table&>(left).column(as_of).as_int64();
183
+ auto right_ts = const_cast<Table&>(right).column(as_of).as_int64();
184
+
185
+ // Result builders
186
+ std::vector<size_t> left_indices;
187
+ std::vector<size_t> right_indices;
188
+
189
+ // For each left row, find all right rows in window
190
+ for (size_t i = 0; i < left.num_rows(); ++i) {
191
+ auto key = extract_key(left, on, i);
192
+ int64_t ts = left_ts[i];
193
+ int64_t ts_min = ts - window_before;
194
+ int64_t ts_max = ts + window_after;
195
+
196
+ auto group_it = right_groups.find(key);
197
+ if (group_it == right_groups.end()) {
198
+ continue; // No matching key
199
+ }
200
+
201
+ const auto& group = group_it->second;
202
+
203
+ // Find range [ts_min, ts_max]
204
+ auto lower = std::lower_bound(group.begin(), group.end(), ts_min,
205
+ [&right_ts](size_t idx, int64_t t) { return right_ts[idx] < t; });
206
+ auto upper = std::upper_bound(group.begin(), group.end(), ts_max,
207
+ [&right_ts](int64_t t, size_t idx) { return t < right_ts[idx]; });
208
+
209
+ for (auto it = lower; it != upper; ++it) {
210
+ left_indices.push_back(i);
211
+ right_indices.push_back(*it);
212
+ }
213
+ }
214
+
215
+ // Build result table (similar to aj)
216
+ Table result("wj_result");
217
+
218
+ // Add left columns
219
+ for (const auto& col_name : left.column_names()) {
220
+ const Column& src = left.column(col_name);
221
+ size_t elem_size = dtype_size(src.dtype());
222
+ std::vector<uint8_t> data(left_indices.size() * elem_size);
223
+
224
+ const uint8_t* src_data = static_cast<const uint8_t*>(src.data());
225
+ for (size_t i = 0; i < left_indices.size(); ++i) {
226
+ std::memcpy(data.data() + i * elem_size,
227
+ src_data + left_indices[i] * elem_size,
228
+ elem_size);
229
+ }
230
+
231
+ result.add_column(Column(col_name, src.dtype(), std::move(data)));
232
+ }
233
+
234
+ // Add right columns (excluding join keys)
235
+ for (const auto& col_name : right.column_names()) {
236
+ if (result.has_column(col_name)) continue;
237
+ if (std::find(on.begin(), on.end(), col_name) != on.end()) continue;
238
+
239
+ const Column& src = right.column(col_name);
240
+ size_t elem_size = dtype_size(src.dtype());
241
+ std::vector<uint8_t> data(right_indices.size() * elem_size);
242
+
243
+ const uint8_t* src_data = static_cast<const uint8_t*>(src.data());
244
+ for (size_t i = 0; i < right_indices.size(); ++i) {
245
+ std::memcpy(data.data() + i * elem_size,
246
+ src_data + right_indices[i] * elem_size,
247
+ elem_size);
248
+ }
249
+
250
+ result.add_column(Column(col_name, src.dtype(), std::move(data)));
251
+ }
252
+
253
+ if (!result.column_names().empty()) {
254
+ result.set_sorted_by(as_of);
255
+ }
256
+ return result;
257
+ }
258
+
259
+ Table inner_join(const Table& left, const Table& right,
260
+ const std::vector<std::string>& on) {
261
+ (void)left; (void)right; (void)on;  // parameters unused until implemented
+ // TODO: Implement inner join
262
+ throw InvalidOperation("inner_join not yet implemented");
263
+ }
264
+
265
+ Table left_join(const Table& left, const Table& right,
266
+ const std::vector<std::string>& on) {
267
+ (void)left; (void)right; (void)on;  // parameters unused until implemented
+ // TODO: Implement left join
268
+ throw InvalidOperation("left_join not yet implemented");
269
+ }
270
+
271
+ } // namespace wayy_db::ops
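A minimal usage sketch of the as-of join above. Table and column names ("trades", "quotes", "sym", "ts") are placeholders; the only real requirements, enforced by aj(), are that both tables are marked sorted on the as-of column and that the join keys are Int64, Timestamp, or Symbol columns:

#include "wayy_db/ops/joins.hpp"
#include "wayy_db/table.hpp"

// For each trade row, pick the most recent quote row with the same "sym"
// whose "ts" is <= the trade's "ts". Unmatched rows keep zero-filled
// right-hand columns, mirroring the null representation used above.
wayy_db::Table enrich_trades(const wayy_db::Table& trades, const wayy_db::Table& quotes) {
    return wayy_db::ops::aj(trades, quotes, {"sym"}, "ts");
}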
src/ops/window.cpp ADDED
@@ -0,0 +1,314 @@
1
+ #include "wayy_db/ops/window.hpp"
2
+
3
+ #include <deque>
4
+ #include <cmath>
5
+ #include <limits>     // std::numeric_limits used by diff/pct_change/shift
+ #include <numeric>
+ #include <stdexcept>  // std::invalid_argument thrown by ema()
6
+
7
+ namespace wayy_db::ops {
8
+
9
+ // Moving average
10
+
11
+ std::vector<double> mavg(const ColumnView<double>& col, size_t window) {
12
+ if (col.empty() || window == 0) return {};
13
+
14
+ std::vector<double> result(col.size());
15
+ double sum = 0.0;
16
+
17
+ for (size_t i = 0; i < col.size(); ++i) {
18
+ sum += col[i];
19
+ if (i >= window) {
20
+ sum -= col[i - window];
21
+ result[i] = sum / static_cast<double>(window);
22
+ } else {
23
+ result[i] = sum / static_cast<double>(i + 1);
24
+ }
25
+ }
26
+
27
+ return result;
28
+ }
29
+
30
+ std::vector<double> mavg(const ColumnView<int64_t>& col, size_t window) {
31
+ if (col.empty() || window == 0) return {};
32
+
33
+ std::vector<double> result(col.size());
34
+ int64_t sum = 0;
35
+
36
+ for (size_t i = 0; i < col.size(); ++i) {
37
+ sum += col[i];
38
+ if (i >= window) {
39
+ sum -= col[i - window];
40
+ result[i] = static_cast<double>(sum) / static_cast<double>(window);
41
+ } else {
42
+ result[i] = static_cast<double>(sum) / static_cast<double>(i + 1);
43
+ }
44
+ }
45
+
46
+ return result;
47
+ }
48
+
49
+ // Moving sum
50
+
51
+ std::vector<double> msum(const ColumnView<double>& col, size_t window) {
52
+ if (col.empty() || window == 0) return {};
53
+
54
+ std::vector<double> result(col.size());
55
+ double sum = 0.0;
56
+
57
+ for (size_t i = 0; i < col.size(); ++i) {
58
+ sum += col[i];
59
+ if (i >= window) {
60
+ sum -= col[i - window];
61
+ }
62
+ result[i] = sum;
63
+ }
64
+
65
+ return result;
66
+ }
67
+
68
+ std::vector<int64_t> msum(const ColumnView<int64_t>& col, size_t window) {
69
+ if (col.empty() || window == 0) return {};
70
+
71
+ std::vector<int64_t> result(col.size());
72
+ int64_t sum = 0;
73
+
74
+ for (size_t i = 0; i < col.size(); ++i) {
75
+ sum += col[i];
76
+ if (i >= window) {
77
+ sum -= col[i - window];
78
+ }
79
+ result[i] = sum;
80
+ }
81
+
82
+ return result;
83
+ }
84
+
85
+ // Moving standard deviation (Welford's online algorithm)
86
+
87
+ std::vector<double> mstd(const ColumnView<double>& col, size_t window) {
88
+ if (col.empty() || window == 0) return {};
89
+
90
+ std::vector<double> result(col.size());
91
+
92
+ for (size_t i = 0; i < col.size(); ++i) {
93
+ size_t start = (i >= window) ? i - window + 1 : 0;
95
+
96
+ double mean = 0.0;
97
+ double m2 = 0.0;
98
+ size_t n = 0;
99
+
100
+ for (size_t j = start; j <= i; ++j) {
101
+ ++n;
102
+ double delta = col[j] - mean;
103
+ mean += delta / static_cast<double>(n);
104
+ double delta2 = col[j] - mean;
105
+ m2 += delta * delta2;
106
+ }
107
+
108
+ result[i] = (n > 1) ? std::sqrt(m2 / static_cast<double>(n)) : 0.0;
109
+ }
110
+
111
+ return result;
112
+ }
113
+
114
+ std::vector<double> mstd(const ColumnView<int64_t>& col, size_t window) {
115
+ if (col.empty() || window == 0) return {};
116
+
117
+ std::vector<double> result(col.size());
118
+
119
+ for (size_t i = 0; i < col.size(); ++i) {
120
+ size_t start = (i >= window) ? i - window + 1 : 0;
121
+
122
+ double mean = 0.0;
123
+ double m2 = 0.0;
124
+ size_t n = 0;
125
+
126
+ for (size_t j = start; j <= i; ++j) {
127
+ ++n;
128
+ double val = static_cast<double>(col[j]);
129
+ double delta = val - mean;
130
+ mean += delta / static_cast<double>(n);
131
+ double delta2 = val - mean;
132
+ m2 += delta * delta2;
133
+ }
134
+
135
+ result[i] = (n > 1) ? std::sqrt(m2 / static_cast<double>(n)) : 0.0;
136
+ }
137
+
138
+ return result;
139
+ }
140
+
141
+ // Moving min/max using monotonic deque for O(n) complexity
142
+
143
+ template<typename T, typename Compare>
144
+ std::vector<T> monotonic_window(const ColumnView<T>& col, size_t window, Compare cmp) {
145
+ if (col.empty() || window == 0) return {};
146
+
147
+ std::vector<T> result(col.size());
148
+ std::deque<size_t> dq; // Indices
149
+
150
+ for (size_t i = 0; i < col.size(); ++i) {
151
+ // Remove elements outside window
152
+ while (!dq.empty() && dq.front() + window <= i) {
153
+ dq.pop_front();
154
+ }
155
+
156
+ // Remove elements that won't be min/max
157
+ while (!dq.empty() && cmp(col[i], col[dq.back()])) {
158
+ dq.pop_back();
159
+ }
160
+
161
+ dq.push_back(i);
162
+ result[i] = col[dq.front()];
163
+ }
164
+
165
+ return result;
166
+ }
167
+
168
+ std::vector<double> mmin(const ColumnView<double>& col, size_t window) {
169
+ return monotonic_window(col, window, std::less<double>{});
170
+ }
171
+
172
+ std::vector<int64_t> mmin(const ColumnView<int64_t>& col, size_t window) {
173
+ return monotonic_window(col, window, std::less<int64_t>{});
174
+ }
175
+
176
+ std::vector<double> mmax(const ColumnView<double>& col, size_t window) {
177
+ return monotonic_window(col, window, std::greater<double>{});
178
+ }
179
+
180
+ std::vector<int64_t> mmax(const ColumnView<int64_t>& col, size_t window) {
181
+ return monotonic_window(col, window, std::greater<int64_t>{});
182
+ }
183
+
184
+ // Exponential moving average
185
+
186
+ std::vector<double> ema(const ColumnView<double>& col, double alpha) {
187
+ if (col.empty()) return {};
188
+ if (alpha <= 0.0 || alpha > 1.0) {
189
+ throw std::invalid_argument("EMA alpha must be in (0, 1]");
190
+ }
191
+
192
+ std::vector<double> result(col.size());
193
+ result[0] = col[0];
194
+
195
+ for (size_t i = 1; i < col.size(); ++i) {
196
+ result[i] = alpha * col[i] + (1.0 - alpha) * result[i - 1];
197
+ }
198
+
199
+ return result;
200
+ }
201
+
202
+ std::vector<double> ema(const ColumnView<int64_t>& col, double alpha) {
203
+ if (col.empty()) return {};
204
+ if (alpha <= 0.0 || alpha > 1.0) {
205
+ throw std::invalid_argument("EMA alpha must be in (0, 1]");
206
+ }
207
+
208
+ std::vector<double> result(col.size());
209
+ result[0] = static_cast<double>(col[0]);
210
+
211
+ for (size_t i = 1; i < col.size(); ++i) {
212
+ result[i] = alpha * static_cast<double>(col[i]) + (1.0 - alpha) * result[i - 1];
213
+ }
214
+
215
+ return result;
216
+ }
217
+
218
+ std::vector<double> ema_span(const ColumnView<double>& col, size_t span) {
219
+ double alpha = 2.0 / (static_cast<double>(span) + 1.0);
220
+ return ema(col, alpha);
221
+ }
222
+
223
+ // Diff
224
+
225
+ std::vector<double> diff(const ColumnView<double>& col, size_t periods) {
226
+ if (col.empty() || periods >= col.size()) return std::vector<double>(col.size(), 0.0);
227
+
228
+ std::vector<double> result(col.size());
229
+ for (size_t i = 0; i < periods; ++i) {
230
+ result[i] = std::numeric_limits<double>::quiet_NaN();
231
+ }
232
+ for (size_t i = periods; i < col.size(); ++i) {
233
+ result[i] = col[i] - col[i - periods];
234
+ }
235
+
236
+ return result;
237
+ }
238
+
239
+ std::vector<int64_t> diff(const ColumnView<int64_t>& col, size_t periods) {
240
+ if (col.empty() || periods >= col.size()) return std::vector<int64_t>(col.size(), 0);
241
+
242
+ std::vector<int64_t> result(col.size(), 0);
243
+ for (size_t i = periods; i < col.size(); ++i) {
244
+ result[i] = col[i] - col[i - periods];
245
+ }
246
+
247
+ return result;
248
+ }
249
+
250
+ // Percent change
251
+
252
+ std::vector<double> pct_change(const ColumnView<double>& col, size_t periods) {
253
+ if (col.empty() || periods >= col.size()) {
254
+ return std::vector<double>(col.size(), std::numeric_limits<double>::quiet_NaN());
255
+ }
256
+
257
+ std::vector<double> result(col.size());
258
+ for (size_t i = 0; i < periods; ++i) {
259
+ result[i] = std::numeric_limits<double>::quiet_NaN();
260
+ }
261
+ for (size_t i = periods; i < col.size(); ++i) {
262
+ if (col[i - periods] != 0.0) {
263
+ result[i] = (col[i] - col[i - periods]) / col[i - periods];
264
+ } else {
265
+ result[i] = std::numeric_limits<double>::quiet_NaN();
266
+ }
267
+ }
268
+
269
+ return result;
270
+ }
271
+
272
+ // Shift
273
+
274
+ std::vector<double> shift(const ColumnView<double>& col, int64_t n) {
275
+ if (col.empty()) return {};
276
+
277
+ std::vector<double> result(col.size(), std::numeric_limits<double>::quiet_NaN());
278
+
279
+ if (n >= 0) {
280
+ size_t offset = static_cast<size_t>(n);
281
+ for (size_t i = offset; i < col.size(); ++i) {
282
+ result[i] = col[i - offset];
283
+ }
284
+ } else {
285
+ size_t offset = static_cast<size_t>(-n);
286
+ for (size_t i = 0; i + offset < col.size(); ++i) {
287
+ result[i] = col[i + offset];
288
+ }
289
+ }
290
+
291
+ return result;
292
+ }
293
+
294
+ std::vector<int64_t> shift(const ColumnView<int64_t>& col, int64_t n) {
295
+ if (col.empty()) return {};
296
+
297
+ std::vector<int64_t> result(col.size(), 0);
298
+
299
+ if (n >= 0) {
300
+ size_t offset = static_cast<size_t>(n);
301
+ for (size_t i = offset; i < col.size(); ++i) {
302
+ result[i] = col[i - offset];
303
+ }
304
+ } else {
305
+ size_t offset = static_cast<size_t>(-n);
306
+ for (size_t i = 0; i + offset < col.size(); ++i) {
307
+ result[i] = col[i + offset];
308
+ }
309
+ }
310
+
311
+ return result;
312
+ }
313
+
314
+ } // namespace wayy_db::ops
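A minimal usage sketch of the rolling operators above, applied to a Float64 column (the "close" name is a placeholder). Partial windows at the start of the series fall back to the rows seen so far, exactly as implemented in mavg():

#include "wayy_db/ops/window.hpp"
#include "wayy_db/column.hpp"

#include <vector>

// Compute a few rolling features over one price series.
void rolling_features(wayy_db::Column& close) {
    auto view = close.as_float64();                                 // ColumnView<double>
    std::vector<double> ma20  = wayy_db::ops::mavg(view, 20);       // simple moving average
    std::vector<double> hi20  = wayy_db::ops::mmax(view, 20);       // O(n) monotonic deque
    std::vector<double> ema10 = wayy_db::ops::ema_span(view, 10);   // alpha = 2 / (span + 1)
    std::vector<double> ret1  = wayy_db::ops::pct_change(view, 1);  // NaN where undefined
    (void)ma20; (void)hi20; (void)ema10; (void)ret1;
}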
src/string_column.cpp ADDED
@@ -0,0 +1,224 @@
1
+ #include "wayy_db/string_column.hpp"
2
+
3
+ #include <bit>
4
+ #include <cstring>
5
+ #include <filesystem>
6
+ #include <fstream>
7
+ #include <stdexcept>
8
+
9
+ namespace fs = std::filesystem;
10
+
11
+ namespace wayy_db {
12
+
13
+ StringColumn::StringColumn(std::string name) : name_(std::move(name)) {
14
+ offsets_.push_back(0); // Initial offset
15
+ }
16
+
17
+ std::string_view StringColumn::get(size_t row) const {
18
+ if (row >= size()) {
19
+ throw InvalidOperation("StringColumn row out of range");
20
+ }
21
+ if (has_validity_ && !is_valid(row)) {
22
+ return {}; // Null row returns empty view
23
+ }
24
+ int64_t start = offsets_[row];
25
+ int64_t end = offsets_[row + 1];
26
+ return std::string_view(reinterpret_cast<const char*>(data_.data() + start),
27
+ static_cast<size_t>(end - start));
28
+ }
29
+
30
+ void StringColumn::append(std::string_view val) {
31
+ int64_t offset = offsets_.back();
32
+ data_.insert(data_.end(), val.begin(), val.end());
33
+ offsets_.push_back(offset + static_cast<int64_t>(val.size()));
34
+
35
+ if (has_validity_) {
36
+ size_t row = size() - 1;
37
+ size_t needed_bytes = (size() + 7) / 8;
38
+ if (validity_.size() < needed_bytes) {
39
+ validity_.push_back(0);
40
+ }
41
+ set_valid(row, true);
42
+ }
43
+ }
44
+
45
+ void StringColumn::append_null() {
46
+ offsets_.push_back(offsets_.back()); // Zero-length entry
47
+ ensure_validity();
48
+ set_valid(size() - 1, false);
49
+ }
50
+
51
+ void StringColumn::set(size_t row, std::string_view val) {
52
+ if (row >= size()) {
53
+ throw InvalidOperation("StringColumn row out of range in set");
54
+ }
55
+ int64_t old_start = offsets_[row];
56
+ int64_t old_end = offsets_[row + 1];
57
+ int64_t old_len = old_end - old_start;
58
+ int64_t new_len = static_cast<int64_t>(val.size());
59
+
60
+ if (new_len == old_len) {
+ // Same length: overwrite in place, offsets stay consistent
+ std::memcpy(data_.data() + old_start, val.data(), val.size());
+ } else {
+ // Length changed: shift the bytes belonging to later rows and adjust
+ // every later offset. Rewriting only offsets_[row + 1] would silently
+ // move the start of the next row, so the O(n) shift is the price of
+ // keeping the shared offsets array valid for all rows.
+ int64_t delta = new_len - old_len;
+ int64_t tail_len = offsets_.back() - old_end;
+ if (delta > 0) {
+ data_.resize(data_.size() + static_cast<size_t>(delta));
+ }
+ std::memmove(data_.data() + old_end + delta,
+ data_.data() + old_end,
+ static_cast<size_t>(tail_len));
+ std::memcpy(data_.data() + old_start, val.data(), val.size());
+ if (delta < 0) {
+ data_.resize(static_cast<size_t>(static_cast<int64_t>(data_.size()) + delta));
+ }
+ for (size_t i = row + 1; i < offsets_.size(); ++i) {
+ offsets_[i] += delta;
+ }
+ }
79
+
80
+ if (has_validity_) {
81
+ set_valid(row, true);
82
+ }
83
+ }
84
+
85
+ // --- Validity bitmap ---
86
+
87
+ void StringColumn::ensure_validity() {
88
+ if (has_validity_) return;
89
+ size_t n = size();
90
+ size_t num_bytes = (n + 7) / 8;
91
+ validity_.assign(num_bytes, 0xFF);
92
+ if (n % 8 != 0) {
93
+ uint8_t mask = static_cast<uint8_t>((1u << (n % 8)) - 1);
94
+ validity_.back() = mask;
95
+ }
96
+ has_validity_ = true;
97
+ }
98
+
99
+ bool StringColumn::is_valid(size_t row) const {
100
+ if (!has_validity_) return true;
101
+ if (row >= size()) return false;
102
+ return (validity_[row / 8] >> (row % 8)) & 1;
103
+ }
104
+
105
+ void StringColumn::set_valid(size_t row, bool valid) {
106
+ if (!has_validity_) ensure_validity();
107
+ if (row >= size()) return;
108
+ if (valid) {
109
+ validity_[row / 8] |= (1u << (row % 8));
110
+ } else {
111
+ validity_[row / 8] &= ~(1u << (row % 8));
112
+ }
113
+ }
114
+
115
+ size_t StringColumn::count_valid() const {
116
+ if (!has_validity_) return size();
117
+ size_t count = 0;
118
+ for (auto byte : validity_) {
119
+ count += std::popcount(byte);
120
+ }
121
+ return count;
122
+ }
123
+
124
+ // --- Persistence ---
125
+ // Files: <dir>/<col_name>.offsets, <col_name>.data, <col_name>.validity
126
+
127
+ void StringColumn::save(const std::string& dir_path, const std::string& col_name) const {
128
+ fs::create_directories(dir_path);
129
+
130
+ // Write offsets
131
+ {
132
+ std::string path = dir_path + "/" + col_name + ".offsets";
133
+ std::ofstream f(path, std::ios::binary);
134
+ if (!f) throw WayyException("Failed to create offsets file: " + path);
135
+ uint64_t count = offsets_.size();
136
+ f.write(reinterpret_cast<const char*>(&count), sizeof(count));
137
+ f.write(reinterpret_cast<const char*>(offsets_.data()),
138
+ static_cast<std::streamsize>(offsets_.size() * sizeof(int64_t)));
139
+ }
140
+
141
+ // Write data
142
+ {
143
+ std::string path = dir_path + "/" + col_name + ".data";
144
+ std::ofstream f(path, std::ios::binary);
145
+ if (!f) throw WayyException("Failed to create data file: " + path);
146
+ uint64_t sz = data_.size();
147
+ f.write(reinterpret_cast<const char*>(&sz), sizeof(sz));
148
+ f.write(reinterpret_cast<const char*>(data_.data()),
149
+ static_cast<std::streamsize>(data_.size()));
150
+ }
151
+
152
+ // Write validity if present
153
+ if (has_validity_) {
154
+ std::string path = dir_path + "/" + col_name + ".validity";
155
+ std::ofstream f(path, std::ios::binary);
156
+ if (!f) throw WayyException("Failed to create validity file: " + path);
157
+ uint64_t sz = validity_.size();
158
+ f.write(reinterpret_cast<const char*>(&sz), sizeof(sz));
159
+ f.write(reinterpret_cast<const char*>(validity_.data()),
160
+ static_cast<std::streamsize>(validity_.size()));
161
+ }
162
+ }
163
+
164
+ StringColumn StringColumn::load(const std::string& dir_path, const std::string& col_name) {
165
+ StringColumn sc(col_name);
166
+ sc.offsets_.clear();
167
+
168
+ // Read offsets
169
+ {
170
+ std::string path = dir_path + "/" + col_name + ".offsets";
171
+ std::ifstream f(path, std::ios::binary);
172
+ if (!f) throw WayyException("Failed to open offsets file: " + path);
173
+ uint64_t count = 0;
174
+ f.read(reinterpret_cast<char*>(&count), sizeof(count));
175
+ sc.offsets_.resize(count);
176
+ f.read(reinterpret_cast<char*>(sc.offsets_.data()),
177
+ static_cast<std::streamsize>(count * sizeof(int64_t)));
178
+ }
179
+
180
+ // Read data
181
+ {
182
+ std::string path = dir_path + "/" + col_name + ".data";
183
+ std::ifstream f(path, std::ios::binary);
184
+ if (!f) throw WayyException("Failed to open data file: " + path);
185
+ uint64_t sz = 0;
186
+ f.read(reinterpret_cast<char*>(&sz), sizeof(sz));
187
+ sc.data_.resize(sz);
188
+ f.read(reinterpret_cast<char*>(sc.data_.data()),
189
+ static_cast<std::streamsize>(sz));
190
+ }
191
+
192
+ // Read validity if present
193
+ {
194
+ std::string path = dir_path + "/" + col_name + ".validity";
195
+ if (fs::exists(path)) {
196
+ std::ifstream f(path, std::ios::binary);
197
+ if (f) {
198
+ uint64_t sz = 0;
199
+ f.read(reinterpret_cast<char*>(&sz), sizeof(sz));
200
+ sc.validity_.resize(sz);
201
+ f.read(reinterpret_cast<char*>(sc.validity_.data()),
202
+ static_cast<std::streamsize>(sz));
203
+ sc.has_validity_ = true;
204
+ }
205
+ }
206
+ }
207
+
208
+ return sc;
209
+ }
210
+
211
+ std::vector<std::string> StringColumn::to_vector() const {
212
+ std::vector<std::string> result;
213
+ result.reserve(size());
214
+ for (size_t i = 0; i < size(); ++i) {
215
+ if (is_valid(i)) {
216
+ result.emplace_back(get(i));
217
+ } else {
218
+ result.emplace_back();
219
+ }
220
+ }
221
+ return result;
222
+ }
223
+
224
+ } // namespace wayy_db
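A minimal usage sketch of the variable-length string storage above: values are packed into one byte buffer addressed by an offsets array with size()+1 entries, and nulls are tracked lazily in a validity bitmap. The directory path and column name are placeholders:

#include "wayy_db/string_column.hpp"

#include <string_view>

void string_column_demo() {
    wayy_db::StringColumn names("name");
    names.append("alice");
    names.append_null();                      // materializes the validity bitmap
    names.append("bob");

    std::string_view first = names.get(0);    // "alice"
    bool has_second = names.is_valid(1);      // false
    names.set(0, "alexandra");                // single-row update
    (void)first; (void)has_second;

    names.save("/tmp/wayy_demo", "name");     // writes .offsets, .data, .validity
    wayy_db::StringColumn loaded = wayy_db::StringColumn::load("/tmp/wayy_demo", "name");
    (void)loaded;
}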
src/table.cpp ADDED
@@ -0,0 +1,778 @@
1
+ #include "wayy_db/table.hpp"
2
+ #include "wayy_db/hash_index.hpp"
3
+
4
+ #include <algorithm>
5
+ #include <any>
6
+ #include <cstring>
7
+ #include <filesystem>
8
+ #include <fstream>
9
+ #include <sstream>
10
+
11
+ namespace fs = std::filesystem;
12
+
13
+ namespace wayy_db {
14
+
15
+ Table::Table(std::string name) : name_(std::move(name)) {}
16
+
17
+ Table::~Table() = default;
18
+
19
+ Table::Table(Table&& other) noexcept
20
+ : name_(std::move(other.name_)),
21
+ num_rows_(other.num_rows_),
22
+ columns_(std::move(other.columns_)),
23
+ column_index_(std::move(other.column_index_)),
24
+ sorted_by_(std::move(other.sorted_by_)),
25
+ string_columns_(std::move(other.string_columns_)),
26
+ string_column_index_(std::move(other.string_column_index_)),
27
+ primary_key_(std::move(other.primary_key_)),
28
+ pk_index_(std::move(other.pk_index_)),
29
+ mmap_files_(std::move(other.mmap_files_)) {
30
+ other.num_rows_ = 0;
31
+ }
32
+
33
+ Table& Table::operator=(Table&& other) noexcept {
34
+ if (this != &other) {
35
+ name_ = std::move(other.name_);
36
+ num_rows_ = other.num_rows_;
37
+ columns_ = std::move(other.columns_);
38
+ column_index_ = std::move(other.column_index_);
39
+ sorted_by_ = std::move(other.sorted_by_);
40
+ string_columns_ = std::move(other.string_columns_);
41
+ string_column_index_ = std::move(other.string_column_index_);
42
+ primary_key_ = std::move(other.primary_key_);
43
+ pk_index_ = std::move(other.pk_index_);
44
+ mmap_files_ = std::move(other.mmap_files_);
45
+ other.num_rows_ = 0;
46
+ }
47
+ return *this;
48
+ }
49
+
50
+ // --- Fixed-width column management ---
51
+
52
+ void Table::add_column(Column column) {
53
+ if (columns_.empty() && string_columns_.empty()) {
54
+ num_rows_ = column.size();
55
+ } else if (column.size() != num_rows_) {
56
+ throw InvalidOperation(
57
+ "Column size mismatch: expected " + std::to_string(num_rows_) +
58
+ ", got " + std::to_string(column.size()));
59
+ }
60
+
61
+ const std::string& col_name = column.name();
62
+ if (column_index_.count(col_name) || string_column_index_.count(col_name)) {
63
+ throw InvalidOperation("Column already exists: " + col_name);
64
+ }
65
+
66
+ column_index_[col_name] = columns_.size();
67
+ columns_.push_back(std::move(column));
68
+ }
69
+
70
+ void Table::add_column(const std::string& name, DType dtype, void* data, size_t size) {
71
+ add_column(Column(name, dtype, data, size, true));
72
+ }
73
+
74
+ // --- String column management ---
75
+
76
+ void Table::add_string_column(StringColumn col) {
77
+ if (columns_.empty() && string_columns_.empty()) {
78
+ num_rows_ = col.size();
79
+ } else if (col.size() != num_rows_) {
80
+ throw InvalidOperation(
81
+ "StringColumn size mismatch: expected " + std::to_string(num_rows_) +
82
+ ", got " + std::to_string(col.size()));
83
+ }
84
+
85
+ const std::string& col_name = col.name();
86
+ if (column_index_.count(col_name) || string_column_index_.count(col_name)) {
87
+ throw InvalidOperation("Column already exists: " + col_name);
88
+ }
89
+
90
+ string_column_index_[col_name] = string_columns_.size();
91
+ string_columns_.push_back(std::move(col));
92
+ }
93
+
94
+ bool Table::has_string_column(const std::string& name) const {
95
+ return string_column_index_.count(name) > 0;
96
+ }
97
+
98
+ StringColumn& Table::string_column(const std::string& name) {
99
+ auto it = string_column_index_.find(name);
100
+ if (it == string_column_index_.end()) {
101
+ throw ColumnNotFound(name);
102
+ }
103
+ return string_columns_[it->second];
104
+ }
105
+
106
+ const StringColumn& Table::string_column(const std::string& name) const {
107
+ auto it = string_column_index_.find(name);
108
+ if (it == string_column_index_.end()) {
109
+ throw ColumnNotFound(name);
110
+ }
111
+ return string_columns_[it->second];
112
+ }
113
+
114
+ // --- General column queries ---
115
+
116
+ bool Table::has_column(const std::string& name) const {
117
+ return column_index_.count(name) > 0 || string_column_index_.count(name) > 0;
118
+ }
119
+
120
+ Column& Table::column(const std::string& name) {
121
+ auto it = column_index_.find(name);
122
+ if (it == column_index_.end()) {
123
+ throw ColumnNotFound(name);
124
+ }
125
+ return columns_[it->second];
126
+ }
127
+
128
+ const Column& Table::column(const std::string& name) const {
129
+ auto it = column_index_.find(name);
130
+ if (it == column_index_.end()) {
131
+ throw ColumnNotFound(name);
132
+ }
133
+ return columns_[it->second];
134
+ }
135
+
136
+ DType Table::column_dtype(const std::string& name) const {
137
+ auto it = column_index_.find(name);
138
+ if (it != column_index_.end()) {
139
+ return columns_[it->second].dtype();
140
+ }
141
+ auto sit = string_column_index_.find(name);
142
+ if (sit != string_column_index_.end()) {
143
+ return DType::String;
144
+ }
145
+ throw ColumnNotFound(name);
146
+ }
147
+
148
+ std::vector<std::string> Table::column_names() const {
149
+ std::vector<std::string> names;
150
+ names.reserve(columns_.size() + string_columns_.size());
151
+ for (const auto& col : columns_) {
152
+ names.push_back(col.name());
153
+ }
154
+ for (const auto& col : string_columns_) {
155
+ names.push_back(col.name());
156
+ }
157
+ return names;
158
+ }
159
+
160
+ void Table::set_sorted_by(const std::string& col) {
161
+ if (!has_column(col)) {
162
+ throw ColumnNotFound(col);
163
+ }
164
+ sorted_by_ = col;
165
+ }
166
+
167
+ // --- Primary key + hash index ---
168
+
169
+ void Table::set_primary_key(const std::string& col_name) {
170
+ if (!has_column(col_name)) {
171
+ throw ColumnNotFound(col_name);
172
+ }
173
+ primary_key_ = col_name;
174
+ rebuild_index();
175
+ }
176
+
177
+ void Table::rebuild_index() {
178
+ if (!primary_key_) return;
179
+
180
+ pk_index_ = std::make_unique<HashIndex>();
181
+ DType pk_dtype = column_dtype(*primary_key_);
182
+
183
+ if (pk_dtype == DType::String) {
184
+ pk_index_->build_str(*this, *primary_key_);
185
+ } else if (pk_dtype == DType::Int64 || pk_dtype == DType::Timestamp || pk_dtype == DType::Decimal6) {
186
+ pk_index_->build_int(*this, *primary_key_);
187
+ } else {
188
+ throw InvalidOperation("Primary key must be String, Int64, Timestamp, or Decimal6");
189
+ }
190
+ }
191
+
192
+ std::optional<size_t> Table::find_row(int64_t key) const {
193
+ if (!pk_index_) return std::nullopt;
194
+ auto row = pk_index_->find_int(key);
195
+ if (row && !columns_.empty() && columns_[0].has_validity()) {
196
+ // Check validity of any fixed column
197
+ if (!columns_[0].is_valid(*row)) return std::nullopt;
198
+ }
199
+ return row;
200
+ }
201
+
202
+ std::optional<size_t> Table::find_row(std::string_view key) const {
203
+ if (!pk_index_) return std::nullopt;
204
+ auto row = pk_index_->find_str(key);
205
+ if (row) {
206
+ // Check validity via the PK string column itself
207
+ const auto& pk_col = string_column(*primary_key_);
208
+ if (pk_col.has_validity() && !pk_col.is_valid(*row)) return std::nullopt;
209
+ }
210
+ return row;
211
+ }
212
+
213
+ // --- CRUD operations ---
214
+
215
+ size_t Table::append_row(const std::unordered_map<std::string, std::any>& values) {
216
+ size_t row_idx = num_rows_;
217
+
218
+ // Append to each fixed-width column
219
+ for (auto& col : columns_) {
220
+ auto it = values.find(col.name());
221
+ if (it == values.end()) {
222
+ // Append default (zero) value
223
+ uint8_t zeros[8] = {};
224
+ col.append(zeros, dtype_size(col.dtype()));
225
+ col.ensure_validity();
226
+ col.set_valid(row_idx, false); // Mark as null
227
+ } else {
228
+ const auto& val = it->second;
229
+ DType dt = col.dtype();
230
+
231
+ if (dt == DType::Int64 || dt == DType::Timestamp || dt == DType::Decimal6) {
232
+ int64_t v = std::any_cast<int64_t>(val);
233
+ col.append(&v, sizeof(v));
234
+ } else if (dt == DType::Float64) {
235
+ double v = std::any_cast<double>(val);
236
+ col.append(&v, sizeof(v));
237
+ } else if (dt == DType::Symbol) {
238
+ uint32_t v = std::any_cast<uint32_t>(val);
239
+ col.append(&v, sizeof(v));
240
+ } else if (dt == DType::Bool) {
241
+ uint8_t v = std::any_cast<uint8_t>(val);
242
+ col.append(&v, sizeof(v));
243
+ }
244
+ }
245
+ }
246
+
247
+ // Append to each string column
248
+ for (auto& scol : string_columns_) {
249
+ auto it = values.find(scol.name());
250
+ if (it == values.end()) {
251
+ scol.append_null();
252
+ } else {
253
+ auto sv = std::any_cast<std::string>(it->second);
254
+ scol.append(sv);
255
+ }
256
+ }
257
+
258
+ ++num_rows_;
259
+
260
+ // Update index
261
+ if (pk_index_ && primary_key_) {
262
+ DType pk_dtype = column_dtype(*primary_key_);
263
+ auto it = values.find(*primary_key_);
264
+ if (it != values.end()) {
265
+ if (pk_dtype == DType::String) {
266
+ pk_index_->insert_str(std::any_cast<std::string>(it->second), row_idx);
267
+ } else {
268
+ pk_index_->insert_int(std::any_cast<int64_t>(it->second), row_idx);
269
+ }
270
+ }
271
+ }
272
+
273
+ return row_idx;
274
+ }
275
+
276
+ bool Table::update_row(int64_t pk, const std::unordered_map<std::string, std::any>& values) {
277
+ auto row = find_row(pk);
278
+ if (!row) return false;
279
+ return update_row_at(*row, values);
280
+ }
281
+
282
+ bool Table::update_row(std::string_view pk, const std::unordered_map<std::string, std::any>& values) {
283
+ auto row = find_row(pk);
284
+ if (!row) return false;
285
+ return update_row_at(*row, values);
286
+ }
287
+
288
+ bool Table::update_row_at(size_t row_idx, const std::unordered_map<std::string, std::any>& values) {
289
+ if (row_idx >= num_rows_) return false;
290
+
291
+ for (const auto& [col_name, val] : values) {
292
+ // Check if it's a string column
293
+ auto sit = string_column_index_.find(col_name);
294
+ if (sit != string_column_index_.end()) {
295
+ auto sv = std::any_cast<std::string>(val);
296
+ string_columns_[sit->second].set(row_idx, sv);
297
+ continue;
298
+ }
299
+
300
+ // Fixed-width column
301
+ auto it = column_index_.find(col_name);
302
+ if (it == column_index_.end()) continue; // Skip unknown columns
303
+
304
+ Column& col = columns_[it->second];
305
+ DType dt = col.dtype();
306
+
307
+ if (dt == DType::Int64 || dt == DType::Timestamp || dt == DType::Decimal6) {
308
+ int64_t v = std::any_cast<int64_t>(val);
309
+ col.set(row_idx, &v, sizeof(v));
310
+ } else if (dt == DType::Float64) {
311
+ double v = std::any_cast<double>(val);
312
+ col.set(row_idx, &v, sizeof(v));
313
+ } else if (dt == DType::Symbol) {
314
+ uint32_t v = std::any_cast<uint32_t>(val);
315
+ col.set(row_idx, &v, sizeof(v));
316
+ } else if (dt == DType::Bool) {
317
+ uint8_t v = std::any_cast<uint8_t>(val);
318
+ col.set(row_idx, &v, sizeof(v));
319
+ }
320
+ }
321
+
322
+ return true;
323
+ }
324
+
325
+ bool Table::delete_row(int64_t pk) {
326
+ auto row = find_row(pk);
327
+ if (!row) return false;
328
+
329
+ // Soft delete: set validity bit to 0 on all columns
330
+ for (auto& col : columns_) {
331
+ col.ensure_validity();
332
+ col.set_valid(*row, false);
333
+ }
334
+ for (auto& scol : string_columns_) {
335
+ scol.set_valid(*row, false);
336
+ }
337
+
338
+ // Remove from index
339
+ if (pk_index_) {
340
+ pk_index_->remove_int(pk);
341
+ }
342
+
343
+ return true;
344
+ }
345
+
346
+ bool Table::delete_row(std::string_view pk) {
347
+ auto row = find_row(pk);
348
+ if (!row) return false;
349
+
350
+ for (auto& col : columns_) {
351
+ col.ensure_validity();
352
+ col.set_valid(*row, false);
353
+ }
354
+ for (auto& scol : string_columns_) {
355
+ scol.set_valid(*row, false);
356
+ }
357
+
358
+ if (pk_index_) {
359
+ pk_index_->remove_str(pk);
360
+ }
361
+
362
+ return true;
363
+ }
364
+
365
+ // --- Filter ---
366
+
367
+ std::vector<size_t> Table::where_eq(const std::string& col_name, int64_t val) const {
368
+ std::vector<size_t> result;
369
+ auto it = column_index_.find(col_name);
370
+ if (it == column_index_.end()) throw ColumnNotFound(col_name);
371
+
372
+ const Column& col = columns_[it->second];
373
+ auto view = col.as<const int64_t>();
374
+ for (size_t i = 0; i < view.size(); ++i) {
375
+ if (col.is_valid(i) && view[i] == val) {
376
+ result.push_back(i);
377
+ }
378
+ }
379
+ return result;
380
+ }
381
+
382
+ std::vector<size_t> Table::where_eq(const std::string& col_name, std::string_view val) const {
383
+ std::vector<size_t> result;
384
+ auto sit = string_column_index_.find(col_name);
385
+ if (sit == string_column_index_.end()) throw ColumnNotFound(col_name);
386
+
387
+ const StringColumn& scol = string_columns_[sit->second];
388
+ for (size_t i = 0; i < scol.size(); ++i) {
389
+ if (scol.is_valid(i) && scol.get(i) == val) {
390
+ result.push_back(i);
391
+ }
392
+ }
393
+ return result;
394
+ }
395
+
396
+ // --- Compaction ---
397
+
398
+ void Table::compact() {
399
+ // Determine which rows are valid (check first available column)
400
+ std::vector<bool> keep(num_rows_, true);
401
+ bool any_deleted = false;
402
+
403
+ // Check fixed columns for validity
404
+ for (const auto& col : columns_) {
405
+ if (col.has_validity()) {
406
+ for (size_t i = 0; i < num_rows_; ++i) {
407
+ if (!col.is_valid(i)) {
408
+ keep[i] = false;
409
+ any_deleted = true;
410
+ }
411
+ }
412
+ break; // Only need to check one column
413
+ }
414
+ }
415
+
416
+ // Also check string columns
417
+ if (!any_deleted) {
418
+ for (const auto& scol : string_columns_) {
419
+ if (scol.has_validity()) {
420
+ for (size_t i = 0; i < scol.size(); ++i) {
421
+ if (!scol.is_valid(i)) {
422
+ keep[i] = false;
423
+ any_deleted = true;
424
+ }
425
+ }
426
+ break;
427
+ }
428
+ }
429
+ }
430
+
431
+ if (!any_deleted) return; // Nothing to compact
432
+
433
+ // Count new rows
434
+ size_t new_rows = 0;
435
+ for (bool k : keep) {
436
+ if (k) ++new_rows;
437
+ }
438
+
439
+ // Compact fixed columns
440
+ for (size_t ci = 0; ci < columns_.size(); ++ci) {
441
+ Column& col = columns_[ci];
442
+ size_t elem_size = dtype_size(col.dtype());
443
+ std::vector<uint8_t> new_data;
444
+ new_data.reserve(new_rows * elem_size);
445
+
446
+ const uint8_t* src = static_cast<const uint8_t*>(col.data());
447
+ for (size_t i = 0; i < num_rows_; ++i) {
448
+ if (keep[i]) {
449
+ new_data.insert(new_data.end(), src + i * elem_size, src + (i + 1) * elem_size);
450
+ }
451
+ }
452
+
453
+ // Replace column
454
+ std::string cname = col.name();
455
+ DType cdtype = col.dtype();
456
+ columns_[ci] = Column(std::move(cname), cdtype, std::move(new_data));
457
+ }
458
+
459
+ // Compact string columns
460
+ for (size_t si = 0; si < string_columns_.size(); ++si) {
461
+ StringColumn& scol = string_columns_[si];
462
+ StringColumn new_scol(scol.name());
463
+ for (size_t i = 0; i < scol.size(); ++i) {
464
+ if (keep[i]) {
465
+ if (scol.is_valid(i)) {
466
+ new_scol.append(scol.get(i));
467
+ } else {
468
+ new_scol.append_null();
469
+ }
470
+ }
471
+ }
472
+ string_columns_[si] = std::move(new_scol);
473
+ }
474
+
475
+ num_rows_ = new_rows;
476
+
477
+ // Rebuild index
478
+ rebuild_index();
479
+ }
480
+
481
+ // --- Persistence ---
482
+
483
+ void Table::save(const std::string& dir_path) const {
484
+ fs::create_directories(dir_path);
485
+
486
+ // Write metadata
487
+ write_metadata(dir_path);
488
+
489
+ // Write each fixed-width column
490
+ for (const auto& col : columns_) {
491
+ std::string col_path = dir_path + "/" + col.name() + ".col";
492
+ std::ofstream file(col_path, std::ios::binary);
493
+
494
+ if (!file) {
495
+ throw WayyException("Failed to create column file: " + col_path);
496
+ }
497
+
498
+ // Write header
499
+ ColumnHeader header{};
500
+ header.magic = WAYY_MAGIC;
501
+ header.version = WAYY_VERSION;
502
+ header.dtype = col.dtype();
503
+ header.row_count = col.size();
504
+ header.compression = 0;
505
+ header.data_offset = sizeof(ColumnHeader);
506
+
507
+ file.write(reinterpret_cast<const char*>(&header), sizeof(header));
508
+
509
+ // Write data
510
+ file.write(static_cast<const char*>(col.data()), col.byte_size());
511
+
512
+ // Write validity bitmap if present
513
+ if (col.has_validity()) {
514
+ std::string vpath = dir_path + "/" + col.name() + ".validity";
515
+ std::ofstream vf(vpath, std::ios::binary);
516
+ if (vf) {
517
+ const auto& bmap = col.validity_bitmap();
518
+ uint64_t sz = bmap.size();
519
+ vf.write(reinterpret_cast<const char*>(&sz), sizeof(sz));
520
+ vf.write(reinterpret_cast<const char*>(bmap.data()),
521
+ static_cast<std::streamsize>(sz));
522
+ }
523
+ }
524
+ }
525
+
526
+ // Write each string column
527
+ for (const auto& scol : string_columns_) {
528
+ scol.save(dir_path, scol.name());
529
+ }
530
+ }
531
+
532
+ void Table::write_metadata(const std::string& dir_path) const {
533
+ std::string meta_path = dir_path + "/_meta.json";
534
+ std::ofstream file(meta_path);
535
+
536
+ if (!file) {
537
+ throw WayyException("Failed to create metadata file: " + meta_path);
538
+ }
539
+
540
+ file << "{\n";
541
+ file << " \"version\": " << WAYY_VERSION << ",\n";
542
+ file << " \"name\": \"" << name_ << "\",\n";
543
+ file << " \"num_rows\": " << num_rows_ << ",\n";
544
+
545
+ if (sorted_by_) {
546
+ file << " \"sorted_by\": \"" << *sorted_by_ << "\",\n";
547
+ } else {
548
+ file << " \"sorted_by\": null,\n";
549
+ }
550
+
551
+ if (primary_key_) {
552
+ file << " \"primary_key\": \"" << *primary_key_ << "\",\n";
553
+ } else {
554
+ file << " \"primary_key\": null,\n";
555
+ }
556
+
557
+ file << " \"columns\": [\n";
558
+ size_t total_cols = columns_.size() + string_columns_.size();
559
+ size_t idx = 0;
560
+ for (const auto& col : columns_) {
561
+ file << " {\"name\": \"" << col.name()
562
+ << "\", \"dtype\": \"" << dtype_to_string(col.dtype()) << "\"}";
563
+ if (++idx < total_cols) file << ",";
564
+ file << "\n";
565
+ }
566
+ for (const auto& scol : string_columns_) {
567
+ file << " {\"name\": \"" << scol.name()
568
+ << "\", \"dtype\": \"string\"}";
569
+ if (++idx < total_cols) file << ",";
570
+ file << "\n";
571
+ }
572
+ file << " ]\n";
573
+ file << "}\n";
574
+ }
575
+
576
+ Table Table::load(const std::string& dir_path) {
577
+ auto [name, num_rows, sorted_by, primary_key, col_info] = read_metadata(dir_path);
578
+
579
+ Table table(name);
580
+
581
+ for (const auto& [col_name, dtype] : col_info) {
582
+ if (dtype == DType::String) {
583
+ // Load string column
584
+ table.add_string_column(StringColumn::load(dir_path, col_name));
585
+ } else {
586
+ // Load fixed-width column
587
+ std::string col_path = dir_path + "/" + col_name + ".col";
588
+ std::ifstream file(col_path, std::ios::binary);
589
+
590
+ if (!file) {
591
+ throw WayyException("Failed to open column file: " + col_path);
592
+ }
593
+
594
+ // Read header
595
+ ColumnHeader header;
596
+ file.read(reinterpret_cast<char*>(&header), sizeof(header));
597
+
598
+ if (header.magic != WAYY_MAGIC) {
599
+ throw WayyException("Invalid column file magic: " + col_path);
600
+ }
601
+
602
+ // Read data
603
+ size_t byte_size = header.row_count * dtype_size(header.dtype);
604
+ std::vector<uint8_t> data(byte_size);
605
+ file.read(reinterpret_cast<char*>(data.data()), byte_size);
606
+
607
+ Column col(col_name, header.dtype, std::move(data));
608
+
609
+ // Load validity bitmap if present
610
+ std::string vpath = dir_path + "/" + col_name + ".validity";
611
+ if (fs::exists(vpath)) {
612
+ std::ifstream vf(vpath, std::ios::binary);
613
+ if (vf) {
614
+ uint64_t sz = 0;
615
+ vf.read(reinterpret_cast<char*>(&sz), sizeof(sz));
616
+ std::vector<uint8_t> bitmap(sz);
617
+ vf.read(reinterpret_cast<char*>(bitmap.data()),
618
+ static_cast<std::streamsize>(sz));
619
+ col.set_validity_bitmap(std::move(bitmap));
620
+ }
621
+ }
622
+
623
+ table.add_column(std::move(col));
624
+ }
625
+ }
626
+
627
+ if (sorted_by) {
628
+ table.set_sorted_by(*sorted_by);
629
+ }
630
+
631
+ if (primary_key) {
632
+ table.set_primary_key(*primary_key);
633
+ }
634
+
635
+ return table;
636
+ }
637
+
638
+ Table Table::mmap(const std::string& dir_path) {
639
+ auto [name, num_rows, sorted_by, primary_key, col_info] = read_metadata(dir_path);
640
+
641
+ Table table(name);
642
+
643
+ for (const auto& [col_name, dtype] : col_info) {
644
+ if (dtype == DType::String) {
645
+ // String columns are loaded (not mmap'd) since they have complex structure
646
+ table.add_string_column(StringColumn::load(dir_path, col_name));
647
+ } else {
648
+ std::string col_path = dir_path + "/" + col_name + ".col";
649
+
650
+ MmapFile mmap_file(col_path, MmapFile::Mode::ReadOnly);
651
+
652
+ // Validate header
653
+ auto* header = static_cast<const ColumnHeader*>(mmap_file.data());
654
+ if (header->magic != WAYY_MAGIC) {
655
+ throw WayyException("Invalid column file magic: " + col_path);
656
+ }
657
+
658
+ // Create column pointing to mmap'd data
659
+ void* data_ptr = static_cast<uint8_t*>(mmap_file.data()) + header->data_offset;
660
+ Column col(col_name, header->dtype, data_ptr, header->row_count, false);
661
+
662
+ // Load validity bitmap (always into memory, small)
663
+ std::string vpath = dir_path + "/" + col_name + ".validity";
664
+ if (fs::exists(vpath)) {
665
+ std::ifstream vf(vpath, std::ios::binary);
666
+ if (vf) {
667
+ uint64_t sz = 0;
668
+ vf.read(reinterpret_cast<char*>(&sz), sizeof(sz));
669
+ std::vector<uint8_t> bitmap(sz);
670
+ vf.read(reinterpret_cast<char*>(bitmap.data()),
671
+ static_cast<std::streamsize>(sz));
672
+ col.set_validity_bitmap(std::move(bitmap));
673
+ }
674
+ }
675
+
676
+ table.add_column(std::move(col));
677
+
678
+ // Keep mmap file alive
679
+ table.mmap_files_.push_back(std::move(mmap_file));
680
+ }
681
+ }
682
+
683
+ if (sorted_by) {
684
+ table.set_sorted_by(*sorted_by);
685
+ }
686
+
687
+ if (primary_key) {
688
+ table.set_primary_key(*primary_key);
689
+ }
690
+
691
+ return table;
692
+ }
693
+
694
+ std::tuple<std::string, size_t, std::optional<std::string>,
695
+ std::optional<std::string>,
696
+ std::vector<std::pair<std::string, DType>>>
697
+ Table::read_metadata(const std::string& dir_path) {
698
+ std::string meta_path = dir_path + "/_meta.json";
699
+ std::ifstream file(meta_path);
700
+
701
+ if (!file) {
702
+ throw WayyException("Failed to open metadata file: " + meta_path);
703
+ }
704
+
705
+ // Simple JSON parsing (minimal implementation)
706
+ std::stringstream buffer;
707
+ buffer << file.rdbuf();
708
+ std::string json = buffer.str();
709
+
710
+ // Extract fields using simple string parsing
711
+ auto extract_string = [&json](const std::string& key) -> std::string {
712
+ std::string pattern = "\"" + key + "\": \"";
713
+ auto pos = json.find(pattern);
714
+ if (pos == std::string::npos) return "";
715
+ pos += pattern.size();
716
+ auto end = json.find("\"", pos);
717
+ return json.substr(pos, end - pos);
718
+ };
719
+
720
+ auto extract_int = [&json](const std::string& key) -> size_t {
721
+ std::string pattern = "\"" + key + "\": ";
722
+ auto pos = json.find(pattern);
723
+ if (pos == std::string::npos) return 0;
724
+ pos += pattern.size();
725
+ return std::stoull(json.substr(pos));
726
+ };
727
+
728
+ std::string name = extract_string("name");
729
+ size_t num_rows_val = extract_int("num_rows");
730
+
731
+ std::optional<std::string> sorted_by;
732
+ std::string sorted_str = extract_string("sorted_by");
733
+ if (!sorted_str.empty()) {
734
+ sorted_by = sorted_str;
735
+ }
736
+
737
+ std::optional<std::string> primary_key;
738
+ std::string pk_str = extract_string("primary_key");
739
+ if (!pk_str.empty()) {
740
+ primary_key = pk_str;
741
+ }
742
+
743
+ // Parse columns array
744
+ std::vector<std::pair<std::string, DType>> columns;
745
+ auto cols_start = json.find("\"columns\":");
746
+ if (cols_start != std::string::npos) {
747
+ auto arr_start = json.find("[", cols_start);
748
+ auto arr_end = json.find("]", arr_start);
749
+ std::string arr = json.substr(arr_start, arr_end - arr_start + 1);
750
+
751
+ size_t pos = 0;
752
+ while ((pos = arr.find("{", pos)) != std::string::npos) {
753
+ auto obj_end = arr.find("}", pos);
754
+ std::string obj = arr.substr(pos, obj_end - pos + 1);
755
+
756
+ // Extract name and dtype from object
757
+ auto name_pos = obj.find("\"name\": \"");
758
+ if (name_pos != std::string::npos) {
759
+ name_pos += 9;
760
+ auto name_end = obj.find("\"", name_pos);
761
+ std::string col_name = obj.substr(name_pos, name_end - name_pos);
762
+
763
+ auto dtype_pos = obj.find("\"dtype\": \"");
764
+ dtype_pos += 10;
765
+ auto dtype_end = obj.find("\"", dtype_pos);
766
+ std::string dtype_str = obj.substr(dtype_pos, dtype_end - dtype_pos);
767
+
768
+ columns.emplace_back(col_name, dtype_from_string(dtype_str));
769
+ }
770
+
771
+ pos = obj_end + 1;
772
+ }
773
+ }
774
+
775
+ return {name, num_rows_val, sorted_by, primary_key, columns};
776
+ }
777
+
778
+ } // namespace wayy_db
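A minimal usage sketch of the row-level CRUD path above. Values travel as std::any and must carry exactly the C++ type the column dtype expects (int64_t, double, uint32_t, uint8_t, or std::string); the layout with an Int64 "id" primary key and a string "email" column is a placeholder:

#include "wayy_db/table.hpp"

#include <any>
#include <string>

void crud_demo(wayy_db::Table& users) {
    users.set_primary_key("id");                     // builds the hash index over "id"

    users.append_row({{"id", int64_t{42}},
                      {"email", std::string{"a@example.com"}}});
    users.update_row(int64_t{42}, {{"email", std::string{"b@example.com"}}});

    auto row = users.find_row(int64_t{42});          // index lookup, honors soft deletes
    users.delete_row(int64_t{42});                   // soft delete: clears validity bits
    users.compact();                                 // physically drops soft-deleted rows
    (void)row;

    users.save("/tmp/wayy_users");                   // _meta.json plus one file per column
    wayy_db::Table reopened = wayy_db::Table::mmap("/tmp/wayy_users");  // read-only, zero-copy
    (void)reopened;
}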
src/types.cpp ADDED
@@ -0,0 +1,25 @@
1
+ #include "wayy_db/types.hpp"
2
+
3
+ #include <unordered_map>
4
+
5
+ namespace wayy_db {
6
+
7
+ DType dtype_from_string(std::string_view s) {
8
+ static const std::unordered_map<std::string_view, DType> map = {
9
+ {"int64", DType::Int64},
10
+ {"float64", DType::Float64},
11
+ {"timestamp", DType::Timestamp},
12
+ {"symbol", DType::Symbol},
13
+ {"bool", DType::Bool},
14
+ {"string", DType::String},
15
+ {"decimal6", DType::Decimal6},
16
+ };
17
+
18
+ auto it = map.find(s);
19
+ if (it == map.end()) {
20
+ throw WayyException("Unknown dtype: " + std::string(s));
21
+ }
22
+ return it->second;
23
+ }
24
+
25
+ } // namespace wayy_db
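A small sketch of the dtype lookup above; Table::read_metadata() round-trips column types through it, so an unknown name fails loudly at load time:

#include "wayy_db/types.hpp"

wayy_db::DType parse_dtype_example() {
    wayy_db::DType dt = wayy_db::dtype_from_string("float64");  // DType::Float64
    // wayy_db::dtype_from_string("uint128") would throw WayyException.
    return dt;
}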
src/wal.cpp ADDED
@@ -0,0 +1,225 @@
1
+ #include "wayy_db/wal.hpp"
2
+ #include "wayy_db/database.hpp"
3
+
4
+ #include <array>
5
+ #include <cstring>
6
+ #include <filesystem>
7
+
8
+ namespace fs = std::filesystem;
9
+
10
+ namespace wayy_db {
11
+
12
+ // Simple CRC32 (IEEE polynomial)
13
+ static const std::array<uint32_t, 256> crc32_table = [] {
14
+ std::array<uint32_t, 256> table{};
15
+ for (uint32_t i = 0; i < 256; ++i) {
16
+ uint32_t crc = i;
17
+ for (int j = 0; j < 8; ++j) {
18
+ crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
19
+ }
20
+ table[i] = crc;
21
+ }
22
+ return table;
23
+ }();
24
+
25
+ WriteAheadLog::WriteAheadLog(const std::string& db_path) {
26
+ fs::create_directories(db_path);
27
+ path_ = db_path + "/wal.bin";
28
+ open_for_append();
29
+ }
30
+
31
+ WriteAheadLog::~WriteAheadLog() {
32
+ if (file_.is_open()) {
33
+ file_.flush();
34
+ file_.close();
35
+ }
36
+ }
37
+
38
+ void WriteAheadLog::open_for_append() {
39
+ if (file_.is_open()) file_.close();
40
+ file_.open(path_, std::ios::binary | std::ios::app);
41
+ if (!file_) {
42
+ throw WayyException("Failed to open WAL file: " + path_);
43
+ }
44
+ }
45
+
46
+ uint32_t WriteAheadLog::crc32(const uint8_t* data, size_t len) {
47
+ uint32_t crc = 0xFFFFFFFF;
48
+ for (size_t i = 0; i < len; ++i) {
49
+ crc = crc32_table[(crc ^ data[i]) & 0xFF] ^ (crc >> 8);
50
+ }
51
+ return crc ^ 0xFFFFFFFF;
52
+ }
53
+
54
+ void WriteAheadLog::write_entry(WalOp op, const std::string& table, size_t row,
55
+ const std::vector<uint8_t>& payload) {
56
+ std::lock_guard<std::mutex> lock(mu_);
57
+
58
+ // Build the entry in a buffer for CRC calculation
59
+ std::vector<uint8_t> buf;
60
+ buf.reserve(4 + 1 + 4 + table.size() + 8 + 4 + payload.size());
61
+
62
+ // Magic
63
+ uint32_t magic = WAL_MAGIC;
64
+ buf.insert(buf.end(), reinterpret_cast<uint8_t*>(&magic),
65
+ reinterpret_cast<uint8_t*>(&magic) + 4);
66
+
67
+ // Op type
68
+ buf.push_back(static_cast<uint8_t>(op));
69
+
70
+ // Table name length + name
71
+ uint32_t tlen = static_cast<uint32_t>(table.size());
72
+ buf.insert(buf.end(), reinterpret_cast<uint8_t*>(&tlen),
73
+ reinterpret_cast<uint8_t*>(&tlen) + 4);
74
+ buf.insert(buf.end(), table.begin(), table.end());
75
+
76
+ // Row ID
77
+ uint64_t row_id = static_cast<uint64_t>(row);
78
+ buf.insert(buf.end(), reinterpret_cast<uint8_t*>(&row_id),
79
+ reinterpret_cast<uint8_t*>(&row_id) + 8);
80
+
81
+ // Payload length + payload
82
+ uint32_t plen = static_cast<uint32_t>(payload.size());
83
+ buf.insert(buf.end(), reinterpret_cast<uint8_t*>(&plen),
84
+ reinterpret_cast<uint8_t*>(&plen) + 4);
85
+ buf.insert(buf.end(), payload.begin(), payload.end());
86
+
87
+ // CRC32
88
+ uint32_t checksum = crc32(buf.data(), buf.size());
89
+ buf.insert(buf.end(), reinterpret_cast<uint8_t*>(&checksum),
90
+ reinterpret_cast<uint8_t*>(&checksum) + 4);
91
+
92
+ // Write to file
93
+ file_.write(reinterpret_cast<const char*>(buf.data()),
94
+ static_cast<std::streamsize>(buf.size()));
95
+ file_.flush();
96
+ }
97
+
98
+ void WriteAheadLog::log_insert(const std::string& table, size_t row,
99
+ const std::vector<uint8_t>& data) {
100
+ write_entry(WalOp::Insert, table, row, data);
101
+ }
102
+
103
+ void WriteAheadLog::log_update(const std::string& table, size_t row,
104
+ const std::string& col, const std::vector<uint8_t>& data) {
105
+ // Encode column name + data as payload
106
+ std::vector<uint8_t> payload;
107
+ uint32_t clen = static_cast<uint32_t>(col.size());
108
+ payload.insert(payload.end(), reinterpret_cast<uint8_t*>(&clen),
109
+ reinterpret_cast<uint8_t*>(&clen) + 4);
110
+ payload.insert(payload.end(), col.begin(), col.end());
111
+ payload.insert(payload.end(), data.begin(), data.end());
112
+ write_entry(WalOp::Update, table, row, payload);
113
+ }
114
+
115
+ void WriteAheadLog::log_delete(const std::string& table, size_t row) {
116
+ write_entry(WalOp::Delete, table, row, {});
117
+ }
118
+
119
+ void WriteAheadLog::checkpoint(Database& db) {
120
+ std::lock_guard<std::mutex> lock(mu_);
121
+
122
+ // Flush and close WAL
123
+ if (file_.is_open()) {
124
+ file_.flush();
125
+ file_.close();
126
+ }
127
+
128
+ // Save all tables to disk
129
+ db.save();
130
+
131
+ // Truncate WAL (start fresh)
132
+ file_.open(path_, std::ios::binary | std::ios::trunc);
133
+ if (!file_) {
134
+ throw WayyException("Failed to truncate WAL: " + path_);
135
+ }
136
+ }
137
+
138
+ void WriteAheadLog::replay(Database& db) {
139
+ if (!fs::exists(path_)) return;
140
+
141
+ std::ifstream wal(path_, std::ios::binary);
142
+ if (!wal) return;
143
+
144
+ // Get file size
145
+ wal.seekg(0, std::ios::end);
146
+ auto file_size = wal.tellg();
147
+ if (file_size <= 0) return;
148
+ wal.seekg(0, std::ios::beg);
149
+
150
+ size_t entries_replayed = 0;
151
+
152
+ while (wal.good() && wal.tellg() < file_size) {
153
+ auto entry_start = wal.tellg();
154
+
155
+ // Read magic
156
+ uint32_t magic = 0;
157
+ wal.read(reinterpret_cast<char*>(&magic), 4);
158
+ if (magic != WAL_MAGIC) break; // Corrupt or end of valid entries
159
+
160
+ // Read op
161
+ uint8_t op_byte = 0;
162
+ wal.read(reinterpret_cast<char*>(&op_byte), 1);
163
+ auto op = static_cast<WalOp>(op_byte);
164
+
165
+ // Read table name
166
+ uint32_t tlen = 0;
167
+ wal.read(reinterpret_cast<char*>(&tlen), 4);
168
+ std::string table_name(tlen, '\0');
169
+ wal.read(table_name.data(), tlen);
170
+
171
+ // Read row ID
172
+ uint64_t row_id = 0;
173
+ wal.read(reinterpret_cast<char*>(&row_id), 8);
174
+
175
+ // Read payload
176
+ uint32_t plen = 0;
177
+ wal.read(reinterpret_cast<char*>(&plen), 4);
178
+ std::vector<uint8_t> payload(plen);
179
+ if (plen > 0) {
180
+ wal.read(reinterpret_cast<char*>(payload.data()), plen);
181
+ }
182
+
183
+ // Read CRC
184
+ uint32_t stored_crc = 0;
185
+ wal.read(reinterpret_cast<char*>(&stored_crc), 4);
186
+
187
+ // Verify CRC (re-read the entry from start to before CRC)
188
+ auto entry_end = wal.tellg();
189
+ size_t entry_size = static_cast<size_t>(entry_end - entry_start) - 4; // Exclude CRC
190
+ wal.seekg(entry_start);
191
+ std::vector<uint8_t> entry_data(entry_size);
192
+ wal.read(reinterpret_cast<char*>(entry_data.data()), entry_size);
193
+ wal.seekg(entry_end); // Skip past CRC we already read
194
+
195
+ uint32_t computed_crc = crc32(entry_data.data(), entry_data.size());
196
+ if (computed_crc != stored_crc) {
197
+ break; // Corrupt entry, stop replay
198
+ }
199
+
200
+ // Apply operation (best-effort: skip if table doesn't exist)
201
+ // The actual replay logic depends on the table having been loaded.
202
+ // For now, we just count replayed entries. Full replay requires
203
+ // deserializing the payload and calling table CRUD methods.
204
+ // TODO: Implement full row-level replay when table schema is available.
205
+ (void)op;
206
+ (void)row_id;
207
+ (void)table_name;
208
+
209
+ ++entries_replayed;
210
+ }
211
+
212
+ // After replay, truncate WAL
213
+ wal.close();
214
+ if (entries_replayed > 0) {
215
+ // Re-save state and clear WAL
216
+ std::ofstream truncate(path_, std::ios::binary | std::ios::trunc);
217
+ }
218
+ }
219
+
220
+ bool WriteAheadLog::has_entries() const {
221
+ if (!fs::exists(path_)) return false;
222
+ return fs::file_size(path_) > 0;
223
+ }
224
+
225
+ } // namespace wayy_db
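A minimal usage sketch of the write-ahead log above: every mutation is appended as a CRC32-checked record, checkpoint() persists the database and truncates the log, and replay() scans leftover records on startup (full row-level replay is still a TODO in this file). The directory, table, and column names are placeholders:

#include "wayy_db/database.hpp"
#include "wayy_db/wal.hpp"

#include <cstdint>
#include <vector>

void wal_demo(wayy_db::Database& db) {
    wayy_db::WriteAheadLog wal("/tmp/wayy_data");        // opens /tmp/wayy_data/wal.bin for append

    std::vector<uint8_t> row_bytes = {1, 2, 3};           // serialized row payload (illustrative)
    wal.log_insert("users", 0, row_bytes);
    wal.log_update("users", 0, "email", row_bytes);       // payload is column name + new bytes
    wal.log_delete("users", 0);

    if (wal.has_entries()) {
        wal.checkpoint(db);                               // db.save(), then truncate wal.bin
    }
}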