Add files using upload-large-folder tool
- .gitattributes +2 -32
- .gitignore +50 -0
- .train_stage1_done +0 -0
- Dockerfile +29 -0
- Plan.MD +0 -0
- README.md +469 -3
- configs/awl_config.yaml +13 -0
- configs/inference_config.yaml +32 -0
- configs/model_config.yaml +35 -0
- configs/training_config.yaml +70 -0
- configs/training_config_fast.yaml +82 -0
- docker-compose.yml +21 -0
- graph_codebase.py +859 -0
- graphify-out/GRAPH_REPORT.md +252 -0
- graphify-out/cost.json +36 -0
- graphify-out/graph.html +0 -0
- graphify-out/graph.json +0 -0
- graphify-out/manifest.json +444 -0
- pyproject.toml +42 -0
- requirements-dev.txt +9 -0
- requirements.txt +59 -0
- scripts/download_all_huggingface_datasets.py +61 -0
- scripts/download_datasets.sh +31 -0
- scripts/download_kaggle_datasets.sh +41 -0
- scripts/evaluate.py +85 -0
- scripts/preprocess_data.py +206 -0
- scripts/pretrain_human_pattern_classifier.py +201 -0
- scripts/run_inference.py +59 -0
- scripts/train.py +390 -0
- src/__init__.py +0 -0
- start.sh +123 -0
- tests/test_evaluation.py +46 -0
- tests/test_model.py +44 -0
- tests/test_preprocessing.py +82 -0
- tests/test_style.py +47 -0
- tests/test_vocabulary.py +38 -0
- todo_registry.md +335 -0
- train.sh +215 -0
- wandb/debug-internal.log +578 -0
- wandb/debug.log +24 -0
- wandb/run-20260502_165105-pwnhqrrf/run-pwnhqrrf.wandb +0 -0
- wandb/run-20260502_165541-4d797dih/run-4d797dih.wandb +0 -0
- wandb/run-20260502_201947-ngpyijum/run-ngpyijum.wandb +0 -0
- wandb/run-20260502_202439-7n7pnref/run-7n7pnref.wandb +0 -0
- wandb/run-20260502_203519-fib23yhh/run-fib23yhh.wandb +0 -0
- wandb/run-20260502_210534-j0t4q38m/run-j0t4q38m.wandb +0 -0
- wandb/run-20260502_212127-vl8pftkj/run-vl8pftkj.wandb +0 -0
- wandb/run-20260502_213822-mmm9bdu9/run-mmm9bdu9.wandb +0 -0
- wandb/run-20260503_120130-xzkygl93/run-xzkygl93.wandb +0 -0
- wandb/run-20260503_120403-cbb6slr5/run-cbb6slr5.wandb +0 -0
.gitattributes
CHANGED
@@ -1,35 +1,5 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+checkpoints/** filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,50 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.egg-info/
dist/
build/
*.egg

# Virtual environment
venv/
.venv/
env/

# IDE
.vscode/
.idea/
*.swp
*.swo

# Data (large files)
data/raw/
data/processed/
!data/awl/

# Model checkpoints
checkpoints/
*.pt
*.pth
*.bin
*.safetensors

# Logs
logs/
wandb/
*.log

# OS
.DS_Store
Thumbs.db

# Jupyter
.ipynb_checkpoints/

# Environment
.env
*.env

# Docker
.dockerignore
.train_stage1_done
ADDED
File without changes
Dockerfile
ADDED
@@ -0,0 +1,29 @@
FROM python:3.10-slim

# System dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    curl \
    default-jre \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download spaCy model
RUN python -m spacy download en_core_web_trf

# Download NLTK data
RUN python -c "import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet')"

# Copy application
COPY . .

# Expose API port
EXPOSE 8000

# Default: run the API server
CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
Plan.MD
ADDED
The diff for this file is too large to render. See raw diff.
README.md
CHANGED
@@ -1,3 +1,469 @@
----
-
----
---
language:
- en
tags:
- text2text-generation
- dyslexia
- grammar-correction
- style-preservation
- lora
- flan-t5
license: mit
base_model: google/flan-t5-small
datasets:
- cambridge/fce
- wi_locness
- jfleg
pipeline_tag: translation
---

# Dyslexia Academic Writing Correction System

> **A style-preserving, grammar-correcting, academic-vocabulary-elevating AI system that corrects dyslexic writing while maintaining the author's personal voice, tone, and authorship signal — not a rewriter, a corrector.**

## Overview

This system takes text written by dyslexic students and corrects grammar, spelling, and fluency errors while:

1. **Preserving the author's unique writing style** via a 512-dimensional style fingerprint vector
2. **Elevating vocabulary to academic register** using Coxhead's Academic Word List (AWL) and BERT-based lexical substitution
3. **Resisting AI detection** through a frozen Human Pattern Classifier that penalises AI-typical writing during training
4. **Maintaining semantic meaning** with cosine-similarity-based semantic preservation loss

The core model is **Google Flan-T5-Small** fine-tuned with **LoRA** (Low-Rank Adaptation), trained on real learner error corpora (FCE, W&I+LOCNESS, JFLEG) augmented with synthetic dyslexia-simulated data.

---

## Features

| Feature | Description |
|---------|-------------|
| **Two-pass spell correction** | Dyslexia-aware phonetic pattern handling via LanguageTool |
| **Style fingerprinting** | 41 raw features → MLP → 512-dim L2-normalised style vector |
| **LoRA fine-tuning** | 1.63% trainable params (1.28M / 78.2M total), rank=8 |
| **Academic vocabulary elevation** | BERT fill-mask → AWL candidate filtering → semantic similarity gate |
| **Human pattern anti-AI loss** | Pre-trained frozen MLP classifier (17-dim features including GPT-2 perplexity) |
| **Combined training loss** | `L_CE + λ₁·L_style + λ₂·L_semantic + λ₃·L_human_pattern` |
| **Sentence-chunked inference** | Long texts split into 128-token chunks matching training window |
| **FastAPI server** | RESTful `/correct` endpoint with CORS and rate limiting |
| **Multi-stage training** | Orchestrated via `train.sh` with checkpoint system (Skip/Redo/Continue) |
| **Synthetic data augmentation** | `DyslexiaSimulator` generates realistic errors from clean text |

---

## Project Structure

```
Rewriter/
├── configs/
│   ├── training_config.yaml        # Full training hyperparameters
│   ├── training_config_fast.yaml   # Quick iteration config
│   ├── inference_config.yaml       # Inference & generation settings
│   ├── model_config.yaml           # Model architecture registry
│   └── awl_config.yaml             # Academic Word List settings
├── scripts/
│   ├── train.py                    # Main training script (Click CLI)
│   ├── evaluate.py                 # Test set evaluation (GLEU, ERRANT, BERTScore)
│   ├── run_inference.py            # Interactive CLI inference
│   ├── preprocess_data.py          # Raw datasets → unified JSONL
│   ├── pretrain_human_pattern_classifier.py  # Stage 3: anti-AI classifier
│   ├── download_datasets.sh        # BEA-2019 dataset downloader
│   └── download_kaggle_datasets.sh # Kaggle human/AI data downloader
├── src/
│   ├── model/
│   │   ├── base_model.py           # Model loader (T5/BART/Llama + LoRA + quantization)
│   │   ├── style_conditioner.py    # Prefix tuning: style → virtual tokens
│   │   ├── generation_utils.py     # Beam search, sampling, batch generation
│   │   └── lora_adapter.py         # LoRA configuration helpers
│   ├── preprocessing/
│   │   ├── pipeline.py             # Full preprocessing orchestrator
│   │   ├── spell_corrector.py      # LanguageTool + dyslexia-aware correction
│   │   ├── dyslexia_simulator.py   # Synthetic error generation (Rello et al.)
│   │   ├── dependency_parser.py    # spaCy dependency tree analysis
│   │   ├── ner_tagger.py           # Named entity protection
│   │   └── sentence_segmenter.py   # Sentence boundary detection
│   ├── style/
│   │   ├── fingerprinter.py        # 41 features → 512-dim style vector
│   │   ├── style_vector.py         # Style vector dataclass
│   │   ├── formality_classifier.py # Rule-based formality scoring
│   │   └── emotion_classifier.py   # Emotion detection
│   ├── training/
│   │   ├── dataset.py              # Pre-tokenized cached dataset with style vectors
│   │   ├── trainer.py              # CorrectionTrainer (HF Trainer + PEFT fixes)
│   │   ├── loss_functions.py       # V1 and V2 combined losses
│   │   ├── human_pattern_extractor.py  # 17-dim feature extraction + classifier
│   │   └── callbacks.py            # Evaluation logging callbacks
│   ├── vocabulary/
│   │   ├── lexical_substitution.py # BERT fill-mask → AWL substitution pipeline
│   │   ├── awl_loader.py           # Coxhead Academic Word List loader
│   │   └── register_filter.py      # Contraction expansion + colloquial replacement
│   ├── inference/
│   │   ├── corrector.py            # End-to-end inference pipeline orchestrator
│   │   └── postprocessor.py        # Cleanup, entity restore, formatting
│   ├── evaluation/
│   │   ├── gleu_scorer.py          # GLEU + BERTScore computation
│   │   ├── errant_evaluator.py     # ERRANT P/R/F0.5 evaluation
│   │   ├── style_metrics.py        # Style similarity + AWL coverage
│   │   └── authorship_verifier.py  # AI detection resistance testing
│   └── api/
│       ├── main.py                 # FastAPI application
│       ├── schemas.py              # Pydantic request/response models
│       └── middleware.py           # Rate limiting + CORS
├── data/
│   ├── raw/                        # Original datasets (FCE, W&I+LOCNESS, JFLEG, Kaggle)
│   ├── processed/                  # Unified JSONL (train/val/test splits)
│   ├── cache/                      # Pre-tokenized dataset caches (.pt files)
│   └── awl/                        # Coxhead Academic Word List
├── train.sh                        # Multi-stage training orchestrator
├── start.sh                        # Inference launcher (CLI or API mode)
├── Dockerfile                      # Production container
├── docker-compose.yml              # Docker deployment
├── requirements.txt                # Python dependencies
└── pyproject.toml                  # Project metadata
```

---

## Design Choices & Rationale

### Why Flan-T5-Small?

| Consideration | Decision |
|---------------|----------|
| **Hardware constraint** | RTX 3050 Laptop GPU (4GB VRAM) — rules out models > 500M params |
| **Architecture** | Encoder-decoder (seq2seq) is ideal for text-to-text correction tasks |
| **Instruction tuning** | Flan-T5 is pre-trained on 1,800+ instruction tasks — follows correction prompts naturally |
| **LoRA efficiency** | Only 1.28M trainable params (1.63%) — fits in 4GB with batch_size=4 + bf16 |

### Why LoRA over Full Fine-Tuning?

- **Memory**: Full fine-tuning of T5-Small requires ~2.5GB for gradients alone; LoRA needs ~200MB
- **Speed**: LoRA converges in 5 epochs (~1,515 steps) on a single RTX 3050
- **Merging**: LoRA weights merge into base model at inference time — zero latency overhead
- **Configuration**: `r=8, alpha=16, dropout=0.05`, targeting all attention + FFN projections (`q, k, v, o, wi_0, wi_1, wo`)
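In PEFT terms, this configuration corresponds roughly to the following (a minimal sketch: the values mirror `configs/training_config.yaml`, while the surrounding loading code is illustrative rather than the project's `lora_adapter.py`):

```python
# Sketch: wrap Flan-T5-Small with the LoRA settings described above.
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSeq2SeqLM

base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,                    # LoRA rank
    lora_alpha=16,          # scaling factor (2×r)
    lora_dropout=0.05,
    target_modules=["q", "k", "v", "o", "wi_0", "wi_1", "wo"],
)
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()  # ≈1.3M trainable of ≈78M total
```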
### Why a Combined Multi-Objective Loss?

The system uses a 4-term loss function: `L = L_CE + 0.3·L_style + 0.5·L_semantic + 0.4·L_human`

| Term | Purpose | Weight |
|------|---------|--------|
| `L_CE` | Standard cross-entropy token prediction | 1.0 |
| `L_style` | `1 - cos_sim(output_style, input_style)` — preserves writing fingerprint | 0.3 |
| `L_semantic` | `1 - cos_sim(input_embedding, output_embedding)` — preserves meaning | 0.5 |
| `L_human` | `1 - HumanPatternClassifier(output)` — penalises AI-like text patterns | 0.4 |

**Why these weights?** Style and human-pattern losses are auxiliary signals — too high and they override grammar correction. The semantic loss is weighted highest (0.5) because meaning preservation is the hardest constraint to satisfy.
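A minimal sketch of how the four terms combine (the cross-entropy value, style vectors, sentence embeddings, and classifier score are assumed to come from the project's own modules in `src/`; only the weighting logic is shown):

```python
import torch.nn.functional as F

def combined_loss(ce_loss, in_style, out_style, in_emb, out_emb, human_score,
                  l_style=0.3, l_semantic=0.5, l_human=0.4):
    """L = L_CE + λ1·L_style + λ2·L_semantic + λ3·L_human (weights from the config)."""
    style_loss = 1 - F.cosine_similarity(out_style, in_style, dim=-1).mean()
    semantic_loss = 1 - F.cosine_similarity(in_emb, out_emb, dim=-1).mean()
    human_loss = 1 - human_score.mean()  # classifier output: 0 = AI-like, 1 = human-like
    return ce_loss + l_style * style_loss + l_semantic * semantic_loss + l_human * human_loss
```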
### Why a Human Pattern Classifier?

AI-generated text has detectable statistical signatures:
- **Lower GPT-2 perplexity** (AI text is more "predictable")
- **Lower burstiness** (AI has uniform sentence lengths; humans vary)
- **Higher AI marker density** (overuse of "delve", "leverage", "furthermore")
- **Lower n-gram novelty** (AI reuses phrases more)

The classifier is a 3-layer MLP (17→128→64→1) pre-trained on ~100k samples from two Kaggle datasets (Shanegerami AI_Human.csv + Starblasters8), then **frozen** during main training. Its output score (0=AI, 1=human) is used as a reward signal.
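The 17 → 128 → 64 → 1 architecture maps to a small PyTorch module along these lines (a sketch: the layer sizes come from the text above, the ReLU/sigmoid choices are assumptions):

```python
import torch.nn as nn

class HumanPatternClassifier(nn.Module):
    """3-layer MLP scoring a 17-dim feature vector: 0 = AI-like, 1 = human-like."""
    def __init__(self, n_features: int = 17):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, 128), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, 1), nn.Sigmoid(),
        )

    def forward(self, x):
        return self.net(x).squeeze(-1)
```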
### Why Sentence-Chunked Inference?

The model was trained with `max_input_length=128` tokens. The task prefix alone consumes ~40 tokens, leaving ~86 tokens for actual text. Long inputs are:

1. Split into sentences using spaCy
2. Grouped into chunks that fit the 128-token budget
3. Corrected chunk by chunk, independently
4. Joined back together

This prevents the model from seeing out-of-distribution input lengths and avoids truncation artifacts.
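Sketched in code, the chunking step looks roughly like this (greedy sentence packing under the token budget; the ~40-token prefix allowance comes from the text above, while the packing loop itself is illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")

def chunk_text(text, tokenizer, budget=128, prefix_tokens=40):
    """Greedily pack sentences into chunks that fit the model's input window."""
    limit = budget - prefix_tokens  # room left after the task prefix
    chunks, current, current_len = [], [], 0
    for sent in nlp(text).sents:
        n = len(tokenizer.encode(sent.text, add_special_tokens=False))
        if current and current_len + n > limit:
            chunks.append(" ".join(current))
            current, current_len = [], 0
        current.append(sent.text)
        current_len += n
    if current:
        chunks.append(" ".join(current))
    return chunks
```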
### Why Post-Generation Vocabulary Elevation?

Rather than relying solely on the model to produce academic vocabulary (which T5-Small lacks the capacity for), we apply a separate **BERT-based lexical substitution** pipeline:

1. POS-tag the output with spaCy
2. Identify non-AWL content words (nouns, verbs, adjectives, adverbs)
3. Mask each candidate → run BERT fill-mask → filter to AWL-only predictions
4. Accept substitution only if `semantic_similarity > 0.82` (measured with `all-mpnet-base-v2`)
5. Track used substitutions to prevent duplicate replacements
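A condensed sketch of steps 3-4 (the fill-mask and similarity models are the ones named above; the filtering loop is illustrative, not the project's exact `lexical_substitution.py`):

```python
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

fill_mask = pipeline("fill-mask", model="bert-large-uncased")
sem_model = SentenceTransformer("all-mpnet-base-v2")

def try_awl_substitution(sentence, word, awl_words, threshold=0.82):
    """Mask one content word, keep the best AWL prediction that preserves meaning."""
    masked = sentence.replace(word, fill_mask.tokenizer.mask_token, 1)
    for pred in fill_mask(masked, top_k=20):
        candidate = pred["token_str"].strip()
        if candidate.lower() not in awl_words or candidate.lower() == word.lower():
            continue
        new_sentence = sentence.replace(word, candidate, 1)
        sim = util.cos_sim(sem_model.encode(sentence),
                           sem_model.encode(new_sentence)).item()
        if sim > threshold:  # semantic similarity gate
            return new_sentence
    return sentence
```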
---

## Quick Start

### Prerequisites

- Python ≥ 3.10
- NVIDIA GPU with ≥ 4GB VRAM (or CPU, slower)
- ~10GB disk space for models and datasets

### Option A: Automated Training Pipeline

```bash
# Clone and setup
git clone https://huggingface.co/morpheuslord/rewriter && cd rewriter
pip install -r requirements.txt

# Set W&B key (optional, for experiment tracking)
export WANDB_API_KEY="your-key-here"

# Run the full 5-stage pipeline
bash train.sh
```

The orchestrator handles: **Setup → Preprocessing → Human Pattern Pre-training → Model Training → Evaluation**

Each stage has a checkpoint system — if interrupted, re-run `train.sh` and select `[S]kip` for completed stages.

### Option B: Manual Step-by-Step

```bash
# 1. Install dependencies
pip install -r requirements.txt
python -m spacy download en_core_web_sm

# 2. Preprocess datasets (FCE, W&I+LOCNESS, JFLEG → unified JSONL)
python scripts/preprocess_data.py

# 3. Pre-train the human pattern classifier
python scripts/pretrain_human_pattern_classifier.py

# 4. Train the correction model
PYTHONPATH=. python scripts/train.py --config configs/training_config.yaml --use-v2-loss

# 5. Merge LoRA adapter into base model for inference
python -c "
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-small', torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(model, 'checkpoints/checkpoint-BEST')
model = model.merge_and_unload()
model.save_pretrained('checkpoints/best_model_merged')
AutoTokenizer.from_pretrained('google/flan-t5-small').save_pretrained('checkpoints/best_model_merged')
"

# 6. Run inference
PYTHONPATH=. python scripts/run_inference.py --text "The studnet recieved alot of informtion."

# 7. Or start the API server
PYTHONPATH=. python -m uvicorn src.api.main:app --host 0.0.0.0 --port 8000
```

---

## Training Pipeline (5 Stages)

### Stage 1: Setup & Dependencies
Installs Python packages, downloads spaCy models (`en_core_web_sm`), and NLTK tokenizers.

### Stage 2: Data Preprocessing
Converts raw datasets into unified JSONL format:

| Dataset | Source | Format | Pairs |
|---------|--------|--------|-------|
| **FCE v2.1** | BEA-2019 Shared Task | Character-level edits | ~28k |
| **W&I+LOCNESS v2.1** | BEA-2019 Shared Task | Character-level edits | ~34k |
| **JFLEG** | Johns Hopkins | 4 reference corrections per source | ~5k |

Output schema: `{"input": "erroneous text", "target": "corrected text", "source": "fce|wi_locness|jfleg"}`

Split: 90% train / 10% validation (with 50% of validation used as test, capped at 500).
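Illustratively, the split logic amounts to something like this (a sketch; the actual implementation lives in `scripts/preprocess_data.py`):

```python
import random

def split_pairs(pairs, seed=42):
    """90/10 train/val split; half of val becomes test, capped at 500 examples."""
    random.Random(seed).shuffle(pairs)
    n_val = len(pairs) // 10
    train, val = pairs[n_val:], pairs[:n_val]
    n_test = min(len(val) // 2, 500)
    test, val = val[:n_test], val[n_test:]
    return train, val, test
```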
### Stage 3: Human Pattern Classifier Pre-Training
Trains a frozen binary MLP classifier on ~100k human vs AI text samples. Uses 17 features:

```
[perplexity, burstiness, sentence_starter_diversity,
 bigram_novelty, trigram_novelty, 4gram_novelty,
 ai_marker_density, overused_discourse_density,
 em_dash_rate, ellipsis_rate, comma_rate, semicolon_rate,
 word_count, sentence_count, mean_sent_length, std_sent_length, ttr]
```

GPT-2 perplexity is computed in batched GPU forward passes. Text features are extracted in parallel via `ProcessPoolExecutor`.
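Two of the less standard features, sketched (these are common definitions of burstiness and n-gram novelty and may differ in detail from the project's extractor):

```python
import statistics

def burstiness(sentence_lengths):
    """Variation in sentence length; human text tends to score higher than AI text."""
    if len(sentence_lengths) < 2:
        return 0.0
    return statistics.stdev(sentence_lengths) / (statistics.mean(sentence_lengths) + 1e-9)

def ngram_novelty(tokens, n=3):
    """Fraction of distinct n-grams; lower values mean more phrase reuse."""
    grams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    return len(set(grams)) / max(len(grams), 1)
```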
### Stage 4: Main Model Training
Fine-tunes Flan-T5-Small with LoRA using the V2 combined loss. Key hyperparameters:

| Parameter | Value |
|-----------|-------|
| Effective batch size | 32 (4 × 8 gradient accumulation) |
| Learning rate | 3e-4 (cosine schedule, 5% warmup) |
| Precision | bf16 (Ampere+ GPUs) |
| Max input tokens | 128 |
| Max target tokens | 128 |
| Epochs | 5 |
| Eval/Save interval | Every 100 steps |

### Stage 5: Evaluation
Runs on test set with metrics: GLEU, BERTScore F1, ERRANT F0.5, Style Similarity, AWL Coverage.

---

## Inference Pipeline (7 Steps)

```
Raw Text
   │
   ▼
1. Preprocessing ─────── LanguageTool spell correction + spaCy parsing
   │
   ▼
2. Style Fingerprinting ─ Extract 41 features → MLP → 512-dim vector
   │
   ▼
3. Sentence-Chunked Generation ─ Split into 128-token chunks → Flan-T5 → rejoin
   │
   ▼
4. Post-Processing ───── Remove artifacts, replace em dashes, fix spacing
   │
   ▼
5. Vocabulary Elevation ─ BERT fill-mask → AWL filtering → semantic gate
   │
   ▼
6. Register Filtering ── Expand contractions, replace colloquialisms
   │
   ▼
7. Metrics ──────────── Style similarity, AWL coverage, readability scores
   │
   ▼
Corrected Text
```

---

## Configuration Reference

### `configs/training_config.yaml`

```yaml
model:
  key: "flan-t5-small"        # flan-t5-xl | flan-t5-large | flan-t5-base | flan-t5-small
  quantize: false             # 4-bit NF4 quantization (needs GPU)
  use_lora: true              # Parameter-efficient fine-tuning

lora:
  r: 8                        # LoRA rank (higher = more capacity, more VRAM)
  lora_alpha: 16              # Scaling factor (usually 2×r)
  lora_dropout: 0.05          # Regularisation
  target_modules: [q, v, k, o, wi_0, wi_1, wo]  # All attention + FFN layers

training:
  per_device_train_batch_size: 4
  gradient_accumulation_steps: 8   # Effective batch = 32
  learning_rate: 3.0e-4
  lr_scheduler_type: cosine
  bf16: true                  # Use bfloat16 on Ampere+ GPUs

loss:
  lambda_style: 0.3           # Style preservation weight
  lambda_semantic: 0.5        # Meaning preservation weight
  lambda_human_pattern: 0.4   # Anti-AI penalty weight
```

### `configs/inference_config.yaml`

```yaml
model:
  key: "flan-t5-small"
  checkpoint_path: "checkpoints/best_model_merged"
  use_lora: false             # Merged model — no adapter needed

generation:
  num_beams: 5                # Beam search width
  length_penalty: 1.2         # > 1.0 rewards longer outputs
  no_repeat_ngram_size: 3     # Prevents repetition
  max_new_tokens: 128         # Must match training max_target_length

vocabulary:
  semantic_threshold: 0.82    # Minimum cosine similarity for AWL substitution
```
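Both files are plain YAML, so any section can be read directly with PyYAML, e.g.:

```python
import yaml

with open("configs/training_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["lora"]["r"])                  # 8
print(cfg["training"]["learning_rate"])  # 0.0003
print(cfg["loss"]["lambda_semantic"])    # 0.5
```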
## API Usage

```bash
# Start the server
PYTHONPATH=. python -m uvicorn src.api.main:app --host 0.0.0.0 --port 8000

# Correct text
curl -X POST http://localhost:8000/correct \
  -H "Content-Type: application/json" \
  -d '{"text": "The studnet recieved alot of informtion.", "style_alpha": 0.6}'

# Health check
curl http://localhost:8000/health
```

Interactive docs available at `http://localhost:8000/docs`.
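The same call from Python (the response fields are defined by the Pydantic models in `src/api/schemas.py`, so this example just prints the raw JSON):

```python
import requests

resp = requests.post(
    "http://localhost:8000/correct",
    json={"text": "The studnet recieved alot of informtion.", "style_alpha": 0.6},
    timeout=60,
)
resp.raise_for_status()
print(resp.json())
```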
---

## Hardware Requirements

| Tier | GPU | Model | Training Time |
|------|-----|-------|---------------|
| **Tested** | RTX 3050 4GB | Flan-T5-Small + LoRA | ~45 min (5 epochs) |
| Recommended | RTX 3090 24GB | Flan-T5-Base + LoRA | ~2h |
| Maximum | A100 80GB | Flan-T5-XL + LoRA | ~12h |

CPU inference is supported but significantly slower (~30s per correction vs ~2s on GPU).

---

## Data Sources

| Dataset | Type | Size | Source |
|---------|------|------|--------|
| FCE v2.1 | Learner errors + corrections | ~28k pairs | Cambridge English |
| W&I+LOCNESS v2.1 | Learner errors + corrections | ~34k pairs | BEA-2019 Shared Task |
| JFLEG | Fluency corrections (4 refs) | ~5k pairs | Johns Hopkins |
| Shanegerami AI_Human.csv | Human vs AI classification | ~50k samples | Kaggle |
| Starblasters8 data.parquet | Human vs AI classification | ~50k samples | Kaggle |
| Coxhead AWL | Academic Word List | 570 families / 549 headwords | Victoria University |

---

## Dyslexia Error Simulation

The `DyslexiaSimulator` generates synthetic training data based on research by Rello et al. (2013, 2017):

| Error Type | Frequency | Example |
|-----------|-----------|---------|
| Phonetic substitution | 35% | "because" → "becaus" |
| Letter transposition | 18% | "the" → "teh" |
| Letter omission | 16% | "important" → "importnt" |
| Letter doubling | 12% | "letter" → "lettter" |
| Letter reversal (b/d, p/q) | 10% | "bad" → "dad" |
| Word boundary errors | 9% | "a lot" → "alot" |
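A toy version of two of these error injectors (illustrative only; the real `DyslexiaSimulator` draws on phonetic confusion tables from Rello et al. and the frequencies in the table above):

```python
import random

def transpose(word, rng):   # "the" -> "teh"
    if len(word) < 3:
        return word
    i = rng.randrange(len(word) - 1)
    return word[:i] + word[i + 1] + word[i] + word[i + 2:]

def omit(word, rng):        # "important" -> "importnt"
    if len(word) < 4:
        return word
    i = rng.randrange(1, len(word) - 1)
    return word[:i] + word[i + 1:]

def simulate(text, p=0.15, seed=0):
    """Apply a random error to roughly a fraction p of words."""
    rng = random.Random(seed)
    out = []
    for w in text.split():
        out.append(rng.choice([transpose, omit])(w, rng) if rng.random() < p else w)
    return " ".join(out)
```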
---

## Style Fingerprint Vector

The 512-dimensional style vector captures 41 raw features:

| Group | Features | Count |
|-------|----------|-------|
| Sentence stats | mean, std, skew of sentence lengths | 3 |
| Word stats | mean, std of word lengths | 2 |
| Lexical | type-token ratio, lexical density | 2 |
| Syntactic | passive/active voice ratio, subordinate clause ratio, avg dependency tree depth | 4 |
| Discourse | 20 academic discourse markers (per 100 words) | 20 |
| Register | hedging frequency, formality score, nominalization ratio | 3 |
| Readability | Flesch reading ease, avg syllables per word | 2 |
| Pronouns | first-person ratio, third-person ratio | 2 |
| Other | question ratio, exclamation ratio, AWL coverage | 3 |

These are projected through a 2-layer MLP (`41 → 256 → 512`) with LayerNorm and GELU activation, then L2-normalised.
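In code, that projection is roughly (a sketch matching the dimensions, LayerNorm/GELU, and L2 normalisation named above; not the project's exact `fingerprinter.py`):

```python
import torch.nn as nn
import torch.nn.functional as F

class StyleProjector(nn.Module):
    """Project 41 raw style features to a 512-dim L2-normalised fingerprint."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(41, 256), nn.LayerNorm(256), nn.GELU(),
            nn.Linear(256, 512),
        )

    def forward(self, features):
        return F.normalize(self.net(features), p=2, dim=-1)
```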
---

## Known Limitations

1. **Model capacity**: Flan-T5-Small (77M params) has limited correction ability compared to larger models
2. **Training window**: 128-token max input means very long sentences may be split mid-clause
3. **Vocabulary elevation**: BERT fill-mask can suggest semantically inappropriate AWL words; the similarity threshold (0.82) is a trade-off between coverage and accuracy
4. **Already-correct text**: The model is trained on error→correction pairs; feeding it clean text produces unpredictable output
5. **LanguageTool latency**: Spell correction takes ~15-20s due to JVM startup on first call
configs/awl_config.yaml
ADDED
@@ -0,0 +1,13 @@
awl:
  primary: "data/awl/coxhead_awl.txt"
  supplementary:
    - "data/awl/domain_lexicons/humanities.txt"
    - "data/awl/domain_lexicons/sciences.txt"
    - "data/awl/domain_lexicons/social_sciences.txt"
  academic_synonyms: "data/awl/academic_synonyms.json"

register:
  expand_contractions: true
  replace_colloquialisms: true
  enforce_third_person_academic: false  # Keep user's voice (don't force "one")
  minimum_formality_score: 0.65
configs/inference_config.yaml
ADDED
@@ -0,0 +1,32 @@
model:
  key: "flan-t5-small"
  checkpoint_path: "checkpoints/best_model_merged"
  quantize: false
  use_lora: false  # Merged model — no adapter needed
  model_hidden_dim: 512

style_conditioner:
  style_dim: 512
  n_prefix_tokens: 10

generation:
  num_beams: 5
  length_penalty: 1.2
  no_repeat_ngram_size: 3
  min_length: 5
  max_new_tokens: 128
  early_stopping: true
  temperature: 0.7
  do_sample: false

vocabulary:
  awl_path: "data/awl/coxhead_awl.txt"
  mlm_model: "bert-large-uncased"
  sem_model: "all-mpnet-base-v2"
  semantic_threshold: 0.82

api:
  host: "0.0.0.0"
  port: 8000
  workers: 1
  reload: false
configs/model_config.yaml
ADDED
@@ -0,0 +1,35 @@
model:
  key: "flan-t5-xl"
  checkpoint_path: "checkpoints/best_model"
  quantize: false
  use_lora: true
  model_hidden_dim: 2048  # flan-t5-xl hidden size
  # model_hidden_dim: 1024  # flan-t5-large
  # model_hidden_dim: 1024  # bart-large
  # model_hidden_dim: 4096  # llama-3.1-8b

style_conditioner:
  style_dim: 512
  n_prefix_tokens: 10

fingerprinter:
  spacy_model: "en_core_web_trf"
  awl_path: "data/awl/coxhead_awl.txt"
  projection_hidden_dim: 256
  projection_output_dim: 512

generation:
  num_beams: 5
  length_penalty: 1.0
  no_repeat_ngram_size: 3
  min_length: 10
  max_new_tokens: 512
  early_stopping: true
  temperature: 0.7  # Slight randomness for naturalness
  do_sample: false  # Beam search by default

vocabulary:
  awl_path: "data/awl/coxhead_awl.txt"
  mlm_model: "bert-large-uncased"
  sem_model: "all-mpnet-base-v2"
  semantic_threshold: 0.82
configs/training_config.yaml
ADDED
@@ -0,0 +1,70 @@
model:
  key: "flan-t5-small"  # flan-t5-xl | flan-t5-large | flan-t5-base | flan-t5-small | bart-large | llama-3.1-8b
  quantize: false
  use_lora: true

lora:
  r: 8
  lora_alpha: 16
  lora_dropout: 0.05
  target_modules: ["q", "v", "k", "o", "wi_0", "wi_1", "wo"]

data:
  train_path: "data/processed/train.jsonl"
  val_path: "data/processed/val.jsonl"
  test_path: "data/processed/test.jsonl"
  max_input_length: 128
  max_target_length: 128
  augment_synthetic: true
  synthetic_ratio: 0.3

training:
  output_dir: "checkpoints/"
  num_train_epochs: 5
  per_device_train_batch_size: 4  # T5-Small in bf16 fits batch=4 in 4GB VRAM
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 8  # Effective batch = 4*8 = 32
  learning_rate: 3.0e-4
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.05
  weight_decay: 0.01
  fp16: false
  bf16: true  # Use bfloat16 on Ampere+ GPUs
  evaluation_strategy: "steps"
  eval_steps: 100
  save_strategy: "steps"
  save_steps: 100
  save_total_limit: 3
  load_best_model_at_end: true
  metric_for_best_model: "eval_loss"
  greater_is_better: false
  logging_dir: "logs/"
  logging_steps: 25
  report_to: ["wandb", "tensorboard"]
  dataloader_num_workers: 0  # Python 3.14 forkserver breaks with workers > 0
  seed: 42
  push_to_hub: false

loss:
  lambda_style: 0.3
  lambda_semantic: 0.5
  lambda_human_pattern: 0.4  # Human pattern reward weight
  sem_model_name: "all-mpnet-base-v2"

generation:
  num_beams: 5
  length_penalty: 1.0
  no_repeat_ngram_size: 3
  min_length: 10
  max_new_tokens: 512
  early_stopping: true

human_pattern:
  classifier_path: "checkpoints/human_pattern_classifier.pt"
  shanegerami_path: "data/raw/shanegerami/AI_Human.csv"
  starblasters_path: "data/raw/starblasters8/data.parquet"
  max_samples_per_source: 50000
  pretrain_epochs: 20
  pretrain_lr: 1.0e-3
  pretrain_batch_size: 512
  target_auc: 0.88
configs/training_config_fast.yaml
ADDED
@@ -0,0 +1,82 @@
# ═══════════════════════════════════════════════════════════════════════════
# training_config_fast.yaml — Optimised for RTX 3050 (4GB) + T5-Small
# ═══════════════════════════════════════════════════════════════════════════
# Usage: python scripts/train.py --config configs/training_config_fast.yaml
#
# Compared to training_config.yaml, this config:
#   - Uses T5-Small (77M params vs 248M for Base)
#   - Lower LoRA rank for fewer trainable params
#   - Larger batch + less accumulation for throughput
#   - More epochs since each is fast
#   - More frequent logging/eval for tighter feedback loop

model:
  key: "flan-t5-small"
  quantize: false
  use_lora: true

lora:
  r: 8
  lora_alpha: 16
  lora_dropout: 0.05
  target_modules: ["q", "v", "k", "o", "wi_0", "wi_1", "wo"]

data:
  train_path: "data/processed/train.jsonl"
  val_path: "data/processed/val.jsonl"
  test_path: "data/processed/test.jsonl"
  max_input_length: 128
  max_target_length: 128
  augment_synthetic: true
  synthetic_ratio: 0.3

training:
  output_dir: "checkpoints/"
  num_train_epochs: 5
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 8  # Effective batch = 32
  learning_rate: 3.0e-4
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.05
  weight_decay: 0.01
  fp16: false
  bf16: true
  evaluation_strategy: "steps"
  eval_steps: 100
  save_strategy: "steps"
  save_steps: 100
  save_total_limit: 3
  load_best_model_at_end: true
  metric_for_best_model: "eval_loss"
  greater_is_better: false
  logging_dir: "logs/"
  logging_steps: 25
  report_to: ["tensorboard"]  # Skip W&B for max speed
  dataloader_num_workers: 0  # Python 3.14 forkserver breaks with workers > 0
  seed: 42
  push_to_hub: false

loss:
  lambda_style: 0.3
  lambda_semantic: 0.5
  lambda_human_pattern: 0.4
  sem_model_name: "all-mpnet-base-v2"

generation:
  num_beams: 5
  length_penalty: 1.0
  no_repeat_ngram_size: 3
  min_length: 10
  max_new_tokens: 512
  early_stopping: true

human_pattern:
  classifier_path: "checkpoints/human_pattern_classifier.pt"
  shanegerami_path: "data/raw/shanegerami/AI_Human.csv"
  starblasters_path: "data/raw/starblasters8/data.parquet"
  max_samples_per_source: 50000
  pretrain_epochs: 20
  pretrain_lr: 1.0e-3
  pretrain_batch_size: 512
  target_auc: 0.88
docker-compose.yml
ADDED
@@ -0,0 +1,21 @@
version: "3.8"

services:
  api:
    build: .
    ports:
      - "8000:8000"
    volumes:
      - ./configs:/app/configs
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
    environment:
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
graph_codebase.py
ADDED
@@ -0,0 +1,859 @@
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
graphify_rebuild.py — One-shot NudR knowledge graph regeneration.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python graphify_rebuild.py # Full rebuild
|
| 7 |
+
python graphify_rebuild.py --watch # Watch mode (rebuilds on file change)
|
| 8 |
+
python graphify_rebuild.py --quick # Skip semantic, AST-only rebuild
|
| 9 |
+
|
| 10 |
+
Outputs (all in graphify-out/):
|
| 11 |
+
GRAPH_REPORT.md — Full community/audit report
|
| 12 |
+
graph.html — Interactive force-directed graph (open in browser)
|
| 13 |
+
graph.json — Raw graph data for tooling
|
| 14 |
+
manifest.json — File hashes for incremental re-runs
|
| 15 |
+
cost.json — Token usage tracking
|
| 16 |
+
"""
|
| 17 |
+
import sys, io, os, json, ast, hashlib, time, argparse
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from datetime import datetime, timezone
|
| 20 |
+
|
| 21 |
+
# Fix Windows console encoding
|
| 22 |
+
if sys.platform == 'win32':
|
| 23 |
+
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
|
| 24 |
+
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
|
| 25 |
+
|
| 26 |
+
# ─── Configuration ───────────────────────────────────────────────────────────
|
| 27 |
+
ROOT = Path(__file__).parent
|
| 28 |
+
OUT_DIR = ROOT / 'graphify-out'
|
| 29 |
+
CACHE_DIR = OUT_DIR / 'cache'
|
| 30 |
+
MANIFEST = OUT_DIR / 'manifest.json'
|
| 31 |
+
REPORT_PATH = OUT_DIR / 'GRAPH_REPORT.md'
|
| 32 |
+
HTML_PATH = OUT_DIR / 'graph.html'
|
| 33 |
+
JSON_PATH = OUT_DIR / 'graph.json'
|
| 34 |
+
COST_PATH = OUT_DIR / 'cost.json'
|
| 35 |
+
|
| 36 |
+
# Directories and patterns to skip
|
| 37 |
+
SKIP_DIRS = {
|
| 38 |
+
'.git', '.venv', 'venv', 'node_modules', '__pycache__', '.mypy_cache',
|
| 39 |
+
'.pytest_cache', '.graphify', 'graphify-out', '.terraform', '.idea',
|
| 40 |
+
'env', 'dist', 'build', 'egg-info', '.tox', '.ruff_cache',
|
| 41 |
+
}
|
| 42 |
+
SKIP_EXTENSIONS = {'.pyc', '.pyo', '.whl', '.egg', '.so', '.dll', '.exe'}
|
| 43 |
+
|
| 44 |
+
# File types for AST extraction
|
| 45 |
+
AST_EXTENSIONS = {'.py'}
|
| 46 |
+
|
| 47 |
+
# File types for corpus (semantic awareness)
|
| 48 |
+
CORPUS_EXTENSIONS = {
|
| 49 |
+
'.py', '.md', '.txt', '.html', '.css', '.js', '.ts', '.json',
|
| 50 |
+
'.yaml', '.yml', '.toml', '.cfg', '.ini', '.proto', '.tf', '.tfvars',
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ─── Step 1: Detect files ────────────────────────────────────────────────────
|
| 55 |
+
def detect_files():
|
| 56 |
+
"""Walk the project and return list of relevant files with metadata."""
|
| 57 |
+
files = []
|
| 58 |
+
total_words = 0
|
| 59 |
+
for dirpath, dirnames, filenames in os.walk(ROOT):
|
| 60 |
+
# Prune skipped directories
|
| 61 |
+
dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]
|
| 62 |
+
for fname in filenames:
|
| 63 |
+
fpath = Path(dirpath) / fname
|
| 64 |
+
ext = fpath.suffix.lower()
|
| 65 |
+
if ext in SKIP_EXTENSIONS:
|
| 66 |
+
continue
|
| 67 |
+
rel = fpath.relative_to(ROOT)
|
| 68 |
+
if any(part.startswith('.') for part in rel.parts[:-1]):
|
| 69 |
+
continue
|
| 70 |
+
try:
|
| 71 |
+
mtime = fpath.stat().st_mtime
|
| 72 |
+
size = fpath.stat().st_size
|
| 73 |
+
except OSError:
|
| 74 |
+
continue
|
| 75 |
+
if ext in CORPUS_EXTENSIONS and size < 5_000_000:
|
| 76 |
+
try:
|
| 77 |
+
content = fpath.read_text(encoding='utf-8', errors='ignore')
|
| 78 |
+
word_count = len(content.split())
|
| 79 |
+
total_words += word_count
|
| 80 |
+
except Exception:
|
| 81 |
+
word_count = 0
|
| 82 |
+
else:
|
| 83 |
+
word_count = 0
|
| 84 |
+
files.append({
|
| 85 |
+
'path': str(rel),
|
| 86 |
+
'ext': ext,
|
| 87 |
+
'mtime': mtime,
|
| 88 |
+
'size': size,
|
| 89 |
+
'words': word_count,
|
| 90 |
+
})
|
| 91 |
+
return files, total_words
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def get_changed_files(files):
|
| 95 |
+
"""Compare against manifest to find changed files."""
|
| 96 |
+
if MANIFEST.exists():
|
| 97 |
+
old_manifest = json.loads(MANIFEST.read_text(encoding='utf-8'))
|
| 98 |
+
else:
|
| 99 |
+
old_manifest = {}
|
| 100 |
+
changed = []
|
| 101 |
+
for f in files:
|
| 102 |
+
old_mtime = old_manifest.get(f['path'])
|
| 103 |
+
if old_mtime is None or f['mtime'] != old_mtime:
|
| 104 |
+
changed.append(f)
|
| 105 |
+
return changed
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# ─── Step 2: AST Extraction ──────────────────────────────────────────────────
|
| 109 |
+
def hash_file(path):
|
| 110 |
+
"""SHA-256 hash for cache keying."""
|
| 111 |
+
h = hashlib.sha256()
|
| 112 |
+
try:
|
| 113 |
+
h.update(Path(path).read_bytes())
|
| 114 |
+
except Exception:
|
| 115 |
+
h.update(path.encode())
|
| 116 |
+
return h.hexdigest()
|
| 117 |
+


def extract_ast_file(filepath):
    """Extract AST nodes and edges from a single Python file."""
    nodes = []
    edges = []
    rel = str(filepath.relative_to(ROOT))
    file_id = rel.replace('\\', '_').replace('/', '_').replace('.', '_')

    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source, filename=str(filepath))
    except SyntaxError:
        return nodes, edges

    # File-level node
    nodes.append({
        'id': file_id,
        'label': filepath.name,
        'file_type': 'code',
        'source_file': rel,
    })

    # Extract module-level docstring
    docstring = ast.get_docstring(tree)
    if docstring and len(docstring) > 20:
        doc_id = f"{file_id}_docstring"
        nodes.append({
            'id': doc_id,
            'label': docstring[:80],
            'file_type': 'rationale',
            'source_file': rel,
        })
        edges.append({
            'source': file_id, 'target': doc_id,
            'relation': 'has_rationale',
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': rel, 'weight': 0.5,
        })

    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            func_id = f"{file_id}_{node.name}"
            label = f"{node.name}()"
            nodes.append({
                'id': func_id,
                'label': label,
                'file_type': 'code',
                'source_file': rel,
                'source_location': f"line {node.lineno}",
            })
            edges.append({
                'source': file_id, 'target': func_id,
                'relation': 'defines',
                'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                'source_file': rel, 'weight': 1.0,
            })

            # Function docstring
            fdoc = ast.get_docstring(node)
            if fdoc and len(fdoc) > 20:
                fdoc_id = f"{func_id}_doc"
                nodes.append({
                    'id': fdoc_id,
                    'label': fdoc[:80],
                    'file_type': 'rationale',
                    'source_file': rel,
                    'source_location': f"line {node.lineno}",
                })
                edges.append({
                    'source': func_id, 'target': fdoc_id,
                    'relation': 'has_rationale',
                    'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                    'source_file': rel, 'weight': 0.5,
                })

            # Calls inside functions
            for child in ast.walk(node):
                if isinstance(child, ast.Call):
                    callee = _get_call_name(child)
                    if callee:
                        edges.append({
                            'source': func_id,
                            'target': callee,
                            'relation': 'calls',
                            'confidence': 'INFERRED', 'confidence_score': 0.7,
                            'source_file': rel, 'weight': 0.8,
                        })

        elif isinstance(node, ast.ClassDef):
            class_id = f"{file_id}_{node.name}"
            nodes.append({
                'id': class_id,
                'label': node.name,
                'file_type': 'code',
                'source_file': rel,
                'source_location': f"line {node.lineno}",
            })
            edges.append({
                'source': file_id, 'target': class_id,
                'relation': 'defines',
                'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                'source_file': rel, 'weight': 1.0,
            })

            # Class docstring
            cdoc = ast.get_docstring(node)
            if cdoc and len(cdoc) > 20:
                cdoc_id = f"{class_id}_doc"
                nodes.append({
                    'id': cdoc_id,
                    'label': cdoc[:80],
                    'file_type': 'rationale',
                    'source_file': rel,
                    'source_location': f"line {node.lineno}",
                })
                edges.append({
                    'source': class_id, 'target': cdoc_id,
                    'relation': 'has_rationale',
                    'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                    'source_file': rel, 'weight': 0.5,
                })

            # Base classes
            for base in node.bases:
                base_name = _get_name(base)
                if base_name:
                    edges.append({
                        'source': class_id, 'target': base_name,
                        'relation': 'inherits',
                        'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                        'source_file': rel, 'weight': 1.0,
                    })

        elif isinstance(node, ast.Import):
            for alias in node.names:
                edges.append({
                    'source': file_id, 'target': alias.name,
                    'relation': 'imports',
                    'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                    'source_file': rel, 'weight': 0.6,
                })

        elif isinstance(node, ast.ImportFrom) and node.module:
            edges.append({
                'source': file_id, 'target': node.module,
                'relation': 'imports',
                'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                'source_file': rel, 'weight': 0.6,
            })

    return nodes, edges
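# Emitted shape for a hypothetical one-function module `util.py` containing
# `def load(): ...` (values illustrative, following the code above):
#   nodes -> [{'id': 'util_py', 'label': 'util.py', 'file_type': 'code', ...},
#             {'id': 'util_py_load', 'label': 'load()', 'file_type': 'code', ...}]
#   edges -> [{'source': 'util_py', 'target': 'util_py_load',
#              'relation': 'defines', 'confidence': 'EXTRACTED', ...}]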


def _get_call_name(node):
    """Extract callable name from ast.Call node."""
    if isinstance(node.func, ast.Name):
        return node.func.id
    elif isinstance(node.func, ast.Attribute):
        return node.func.attr
    return None


def _get_name(node):
    """Extract name from various AST node types."""
    if isinstance(node, ast.Name):
        return node.id
    elif isinstance(node, ast.Attribute):
        return node.attr
    return None
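# Note (illustrative): attribute calls resolve to their final attribute, so
# `session.get_cached_image(...)` yields 'get_cached_image' with the receiver
# discarded. That ambiguity is one reason downstream 'calls' edges are only
# INFERRED rather than EXTRACTED.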


def _resolve_edges(all_nodes, all_edges):
    """Post-process edges to resolve bare names to actual node IDs.

    The per-file AST extraction produces edges with bare targets:
    - calls: target='get_cached_image' (bare function name)
    - imports: target='app.core.session' (dotted module path)

    This function resolves them to actual node IDs so they survive
    the graph build phase (which drops unresolvable targets).
    """
    node_ids = {n['id'] for n in all_nodes}

    # Build function name → [node_id, ...] index
    func_index: dict[str, list[str]] = {}
    for n in all_nodes:
        if n.get('file_type') == 'code' and '(' in n.get('label', ''):
            # label looks like "get_cached_image()"
            bare_name = n['label'].rstrip('()')
            func_index.setdefault(bare_name, []).append(n['id'])

    # Build module path → file node ID map
    # e.g. 'app.core.session' → 'app_core_session_py'
    module_index: dict[str, str] = {}
    for n in all_nodes:
        src = n.get('source_file', '')
        if src.endswith('.py'):
            # Convert 'app/core/session.py' or 'app\core\session.py'
            # → dotted module: 'app.core.session'
            mod_path = src.replace('\\', '/').replace('/', '.').removesuffix('.py')
            # Strip a trailing '.__init__' so package imports resolve
            mod_path_init = mod_path.removesuffix('.__init__')
            nid = n['id']
            # Only map file-level nodes (no functions/classes)
            if nid == src.replace('\\', '_').replace('/', '_').replace('.', '_'):
                module_index[mod_path] = nid
                if mod_path != mod_path_init:
                    module_index[mod_path_init] = nid

    resolved_edges = []
    calls_resolved = 0
    imports_resolved = 0
    dropped = 0

    for edge in all_edges:
        rel = edge.get('relation', '')

        if rel == 'calls':
            target = edge['target']
            # Try exact match first
            if target in node_ids:
                resolved_edges.append(edge)
                calls_resolved += 1
                continue
            # Resolve via function index
            matches = func_index.get(target, [])
            if matches:
                for match_id in matches:
                    # Don't create self-edges within the same file
                    if match_id.rsplit('_', 1)[0] != edge['source'].rsplit('_', 1)[0] or len(matches) == 1:
                        resolved_edges.append({
                            **edge,
                            'target': match_id,
                            'confidence': 'INFERRED' if len(matches) > 1 else 'EXTRACTED',
                            'confidence_score': 0.9 if len(matches) == 1 else 0.6,
                        })
                        calls_resolved += 1
            else:
                dropped += 1

        elif rel == 'imports':
            target = edge['target']
            # Try exact match as node ID first
            if target in node_ids:
                resolved_edges.append(edge)
                imports_resolved += 1
                continue
            # Resolve dotted module path to file node ID
            resolved_id = module_index.get(target)
            if resolved_id:
                resolved_edges.append({**edge, 'target': resolved_id})
                imports_resolved += 1
                continue
            # Try progressively shorter prefixes
            # e.g. 'app.core.session.revoke_all' → 'app.core.session' → 'app.core' → 'app'
            parts = target.split('.')
            found = False
            for i in range(len(parts) - 1, 0, -1):
                prefix = '.'.join(parts[:i])
                resolved_id = module_index.get(prefix)
                if resolved_id:
                    resolved_edges.append({**edge, 'target': resolved_id})
                    imports_resolved += 1
                    found = True
                    break
            if not found:
                # External/stdlib import — drop it
                dropped += 1

        else:
            # defines, has_rationale, etc — keep as-is
            resolved_edges.append(edge)

    print(f" Resolved: {calls_resolved} calls, {imports_resolved} imports, {dropped} dropped (external/stdlib)")
    return resolved_edges
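# Resolution walk-through (hypothetical edge, following the logic above):
#   {'source': 'app_api_images_py_fetch', 'target': 'get_cached_image', 'relation': 'calls'}
# with func_index['get_cached_image'] == ['app_core_cache_py_get_cached_image']
# keeps the single unambiguous match as EXTRACTED with score 0.9; two or more
# candidate definitions would fan out into INFERRED edges with score 0.6.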


def run_ast_extraction(files, use_cache=True):
    """Run AST extraction on all Python files, with caching."""
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    all_nodes = []
    all_edges = []
    cached, extracted = 0, 0

    # Collect valid cache hashes for cleanup
    valid_hashes = set()
    py_files = [f for f in files if f['ext'] in AST_EXTENSIONS]
    for f in py_files:
        fpath = ROOT / f['path']
        fhash = hash_file(fpath)
        valid_hashes.add(fhash)
        cache_file = CACHE_DIR / f"{fhash}.json"

        if use_cache and cache_file.exists():
            data = json.loads(cache_file.read_text(encoding='utf-8'))
            all_nodes.extend(data.get('nodes', []))
            all_edges.extend(data.get('edges', []))
            cached += 1
        else:
            nodes, edges = extract_ast_file(fpath)
            all_nodes.extend(nodes)
            all_edges.extend(edges)
            # Write cache
            cache_file.write_text(json.dumps({
                'nodes': nodes, 'edges': edges,
            }, indent=2), encoding='utf-8')
            extracted += 1

    # Clean stale cache entries
    stale = 0
    for cache_file in CACHE_DIR.glob('*.json'):
        h = cache_file.stem
        if h not in valid_hashes:
            cache_file.unlink()
            stale += 1

    print(f" AST: {len(py_files)} Python files ({cached} cached, {extracted} extracted)")
    if stale:
        print(f" Cache cleanup: {stale} stale entries removed")
    print(f" AST: {len(all_nodes)} nodes, {len(all_edges)} edges (raw)")

    # Resolve bare targets to actual node IDs
    all_edges = _resolve_edges(all_nodes, all_edges)
    print(f" AST: {len(all_nodes)} nodes, {len(all_edges)} edges (resolved)")
    return all_nodes, all_edges
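# Cache layout (from the code above): one JSON blob per source file at
# CACHE_DIR / f"{hash_file(path)}.json" holding {"nodes": [...], "edges": [...]};
# entries whose hash no longer matches any live file are unlinked on each run,
# so the cache directory never accumulates garbage from deleted files.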


# ─── Step 3: Semantic Extraction ─────────────────────────────────────────────
def build_semantic_nodes():
    """
    Build semantic nodes from documentation files.
    These capture high-level architecture concepts that AST can't see.
    """
    nodes = []
    edges = []
    hyperedges = []

    # Architecture components from README
    arch_nodes = [
        ("nudr_api", "NudR Stateless API", "README.md"),
        ("fastapi_backend", "FastAPI Stateless Backend", "README.md"),
        ("supabase_db", "Supabase PostgreSQL Database", "README.md"),
        ("redis_cache", "Redis Session & Cache Store", "README.md"),
        ("cloudflare_proxy", "Cloudflare Edge Proxy", "README.md"),
        ("stripe_payments", "Stripe Payment Integration", "README.md"),
        ("firebase_fcm", "Firebase FCM Push Notifications", "README.md"),
        ("e2ee_encryption", "E2EE X25519 Key Exchange", "README.md"),
        ("protobuf_framing", "Protobuf Binary WebSocket Framing", "README.md"),
        ("hmac_verification", "HMAC-SHA256 Request Verification", "README.md"),
        ("origin_secret", "X-Origin-Secret Middleware", "README.md"),
        ("pow_challenge", "Proof-of-Work Challenge", "README.md"),
        ("rate_limiting", "Per-IP Rate Limiting", "README.md"),
        ("aws_secrets", "AWS Secrets Manager Integration", "README.md"),
        ("terraform_infra", "Terraform AWS Infrastructure", "README.md"),
        ("vpc_network", "VPC Network Topology", "README.md"),
        ("alb_autoscaling", "ALB + Auto Scaling Group", "README.md"),
        ("lambda_rotator", "Lambda Origin Secret Rotator", "README.md"),
        ("unified_ws", "Unified WebSocket Endpoint /ws", "README.md"),
        ("feed_ws", "Feed WebSocket Channel", "README.md"),
        ("chat_ws", "Chat WebSocket Channel", "README.md"),
        ("keysync_ws", "Keysync WebSocket Channel", "README.md"),
        ("discovery_ws", "Discovery WebSocket Channel", "README.md"),
        ("attack_detection", "Attack Detection & IP Risk Management", "README.md"),
    ]

    for nid, label, src in arch_nodes:
        nodes.append({
            'id': f"sem_{nid}", 'label': label,
            'file_type': 'document', 'source_file': src,
        })

    # Architecture edges
    arch_edges = [
        ("nudr_api", "fastapi_backend", "implements"),
        ("fastapi_backend", "supabase_db", "references"),
        ("fastapi_backend", "redis_cache", "references"),
        ("cloudflare_proxy", "origin_secret", "references"),
        ("origin_secret", "lambda_rotator", "references"),
        ("stripe_payments", "fastapi_backend", "references"),
        ("firebase_fcm", "fastapi_backend", "references"),
        ("e2ee_encryption", "keysync_ws", "references"),
        ("protobuf_framing", "unified_ws", "references"),
        ("terraform_infra", "vpc_network", "references"),
        ("terraform_infra", "alb_autoscaling", "references"),
        ("terraform_infra", "aws_secrets", "references"),
        ("attack_detection", "rate_limiting", "references"),
        ("unified_ws", "feed_ws", "conceptually_related_to"),
        ("unified_ws", "chat_ws", "conceptually_related_to"),
        ("unified_ws", "keysync_ws", "conceptually_related_to"),
        ("unified_ws", "discovery_ws", "conceptually_related_to"),
    ]

    for src, tgt, rel in arch_edges:
        edges.append({
            'source': f"sem_{src}", 'target': f"sem_{tgt}",
            'relation': rel,
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': 'README.md', 'weight': 1.0,
        })

    # Feed system nodes (from feed_system_documentation.md)
    feed_nodes = [
        ("feed_system", "Feed System Technical Documentation", "PLAN/feed_system_documentation.md"),
        ("feed_scoring", "Multi-Factor Scoring Algorithm", "PLAN/feed_system_documentation.md"),
        ("feed_pool", "Feed Pool Computation Pipeline", "PLAN/feed_system_documentation.md"),
        ("feed_filters", "Feed Hard Filters (12 Rules)", "PLAN/feed_system_documentation.md"),
        ("feed_heatmap", "Preference Heatmap (Learned AI)", "PLAN/feed_system_documentation.md"),
        ("feed_reciprocal", "Reciprocal Boost & Injection", "PLAN/feed_system_documentation.md"),
        ("feed_gradient", "3-Tier Gradient Distribution", "PLAN/feed_system_documentation.md"),
        ("feed_redis", "Feed Redis Key Schema", "PLAN/feed_system_documentation.md"),
    ]

    for nid, label, src in feed_nodes:
        nodes.append({
            'id': f"sem_{nid}", 'label': label,
            'file_type': 'document', 'source_file': src,
        })

    feed_edges = [
        ("feed_system", "nudr_api", "references"),
        ("feed_pool", "redis_cache", "references"),
        ("feed_pool", "supabase_db", "references"),
        ("feed_scoring", "feed_pool", "references"),
        ("feed_filters", "feed_pool", "references"),
        ("feed_heatmap", "feed_scoring", "references"),
        ("feed_reciprocal", "feed_scoring", "references"),
        ("feed_gradient", "feed_scoring", "references"),
        ("feed_redis", "redis_cache", "references"),
        ("feed_system", "feed_ws", "references"),
    ]

    for src, tgt, rel in feed_edges:
        edges.append({
            'source': f"sem_{src}", 'target': f"sem_{tgt}",
            'relation': rel,
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': 'PLAN/feed_system_documentation.md', 'weight': 1.0,
        })

    # Logic analysis nodes
    logic_nodes = [
        ("logic_analysis", "Logic-Level Async Issue Audit", "PLAN/LOGIC_ANALYSIS.md"),
        ("id_ws_reuse", "DISASTROUS: id(ws) Memory Reuse Bug", "PLAN/LOGIC_ANALYSIS.md"),
        ("token_refresh_crash", "DISASTROUS: Token Refresh Crash Window", "PLAN/LOGIC_ANALYSIS.md"),
        ("pubsub_crash", "DISASTROUS: PubSub Listener Permanent Crash", "PLAN/LOGIC_ANALYSIS.md"),
        ("redis_pool_exhaustion", "DISASTROUS: Redis Connection Pool Exhaustion", "PLAN/LOGIC_ANALYSIS.md"),
        ("preference_race", "Race Condition: Preference Merge", "PLAN/LOGIC_ANALYSIS.md"),
    ]

    for nid, label, src in logic_nodes:
        nodes.append({
            'id': f"sem_{nid}", 'label': label,
            'file_type': 'document', 'source_file': src,
        })

    logic_edges = [
        ("id_ws_reuse", "unified_ws", "references"),
        ("token_refresh_crash", "unified_ws", "references"),
        ("pubsub_crash", "redis_cache", "references"),
        ("redis_pool_exhaustion", "redis_cache", "references"),
        ("preference_race", "supabase_db", "references"),
        ("logic_analysis", "nudr_api", "references"),
    ]

    for src, tgt, rel in logic_edges:
        edges.append({
            'source': f"sem_{src}", 'target': f"sem_{tgt}",
            'relation': rel,
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': 'PLAN/LOGIC_ANALYSIS.md', 'weight': 1.0,
        })

    # Hyperedges
    hyperedges = [
        {
            'id': 'websocket_channels',
            'label': 'WebSocket Channel System',
            'nodes': ['sem_unified_ws', 'sem_feed_ws', 'sem_chat_ws', 'sem_keysync_ws', 'sem_discovery_ws'],
            'relation': 'participate_in',
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': 'README.md',
        },
        {
            'id': 'security_stack',
            'label': 'Security Defense Stack',
            'nodes': ['sem_hmac_verification', 'sem_origin_secret', 'sem_pow_challenge', 'sem_rate_limiting', 'sem_attack_detection'],
            'relation': 'participate_in',
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': 'README.md',
        },
        {
            'id': 'feed_pipeline',
            'label': 'Feed Recommendation Pipeline',
            'nodes': ['sem_feed_pool', 'sem_feed_filters', 'sem_feed_scoring', 'sem_feed_heatmap', 'sem_feed_reciprocal', 'sem_feed_gradient'],
            'relation': 'form',
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': 'PLAN/feed_system_documentation.md',
        },
    ]

    print(f" Semantic: {len(nodes)} nodes, {len(edges)} edges, {len(hyperedges)} hyperedges")
    return nodes, edges, hyperedges
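# Hyperedge consumption sketch (hypothetical helper, not part of this
# pipeline): expanding a hyperedge into pairwise links is one simple way to
# feed it to a plain pairwise-graph library.
#   import itertools
#   for h in hyperedges:
#       for a, b in itertools.combinations(h['nodes'], 2):
#           G.add_edge(a, b, relation=h['relation'])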


# ─── Step 4: Merge & Build Graph ─────────────────────────────────────────────
def merge_and_build(ast_nodes, ast_edges, sem_nodes, sem_edges, hyperedges):
    """Merge AST + semantic, build NetworkX graph, cluster, analyze."""
    from graphify.build import build_from_json
    from graphify.cluster import cluster, score_all
    from graphify.analyze import god_nodes, surprising_connections, suggest_questions

    # Merge: AST first, deduplicate semantic by id
    seen = {n['id'] for n in ast_nodes}
    merged_nodes = list(ast_nodes)
    for n in sem_nodes:
        if n['id'] not in seen:
            merged_nodes.append(n)
            seen.add(n['id'])

    merged_edges = ast_edges + sem_edges

    extraction = {
        'nodes': merged_nodes,
        'edges': merged_edges,
        'hyperedges': hyperedges,
    }

    G = build_from_json(extraction)
    communities = cluster(G)
    cohesion = score_all(G, communities)
    gods = god_nodes(G)
    surprises = surprising_connections(G, communities)

    # Auto-label communities
    labels = {}
    for cid, members in communities.items():
        names = " ".join(members[:10]).lower()
        if 'feed' in names and 'service' in names:
            labels[cid] = "Feed System"
        elif 'feed' in names and ('score' in names or 'pool' in names):
            labels[cid] = "Feed Scoring & Pool"
        elif 'chat' in names and ('ws' in names or 'websocket' in names):
            labels[cid] = "Chat WebSocket"
        elif 'keysync' in names or 'key_exchange' in names:
            labels[cid] = "Key Exchange & Sync"
        elif 'discovery' in names and ('match' in names or 'like' in names):
            labels[cid] = "Discovery & Matching"
        elif 'auth' in names or 'signup' in names or 'signin' in names:
            labels[cid] = "Authentication"
        elif 'payment' in names or 'stripe' in names:
            labels[cid] = "Payments & Billing"
        elif 'setting' in names or 'profile' in names or 'preference' in names:
            labels[cid] = "Settings & Profiles"
        elif 'consent' in names:
            labels[cid] = "Consent System"
        elif 'report' in names or 'violation' in names:
            labels[cid] = "Reporting & Moderation"
        elif 'notification' in names or 'fcm' in names:
            labels[cid] = "Push Notifications"
        elif 'redis' in names or 'cache' in names:
            labels[cid] = "Redis & Caching"
        elif 'supabase' in names or 'migration' in names:
            labels[cid] = "Database Layer"
        elif 'terraform' in names or 'aws' in names or 'vpc' in names:
            labels[cid] = "Infrastructure (Terraform)"
        elif 'security' in names or 'rate_limit' in names or 'attack' in names:
            labels[cid] = "Security & Rate Limiting"
        elif 'codec' in names or 'hmac' in names or 'protobuf' in names:
            labels[cid] = "WebSocket Codec"
        elif 'unified' in names and 'ws' in names:
            labels[cid] = "Unified WebSocket"
        elif 'token' in names:
            labels[cid] = "Token Management"
        elif 'image' in names:
            labels[cid] = "Image Processing"
        elif 'event' in names or 'pending' in names:
            labels[cid] = "Event Queue"
        elif 'linkup' in names:
            labels[cid] = "Linkup System"
        elif 'test' in names:
            labels[cid] = "Tests"
        elif 'nuke' in names or 'script' in names:
            labels[cid] = "Utility Scripts"
        elif 'email' in names or 'otp' in names:
            labels[cid] = "Email & OTP"
        elif 'flutter' in names:
            labels[cid] = "Flutter Directives"
        elif 'readme' in names:
            labels[cid] = "API Documentation"
        else:
            labels[cid] = f"Module Group {cid}"

    questions = suggest_questions(G, communities, labels)

    print(f" Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges, {len(communities)} communities")
    return G, communities, cohesion, labels, gods, surprises, questions, extraction
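# Labeling caveat (illustrative): the chain above is first-match-wins over the
# lowercased names of a community's first ten members, e.g.
#   names = "feed_service feed_pool stripe_checkout"
# matches the first branch and yields "Feed System" even though payment code
# is present; anything unmatched falls back to f"Module Group {cid}".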


# ─── Step 5: Generate Outputs ────────────────────────────────────────────────
def generate_outputs(G, communities, cohesion, labels, gods, surprises, questions, detection, extraction):
    """Generate report, HTML, JSON, and manifest."""
    from graphify.report import generate
    from graphify.export import to_json, to_html

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    tokens = {'input': 0, 'output': 0}

    # Report
    report = generate(
        G, communities, cohesion, labels, gods, surprises,
        detection, tokens, str(ROOT), suggested_questions=questions,
    )
    REPORT_PATH.write_text(report, encoding='utf-8')
    print(f" -> {REPORT_PATH.relative_to(ROOT)}")

    # JSON
    to_json(G, communities, str(JSON_PATH))
    print(f" -> {JSON_PATH.relative_to(ROOT)}")

    # HTML
    if G.number_of_nodes() <= 5000:
        to_html(G, communities, str(HTML_PATH), community_labels=labels)
        print(f" -> {HTML_PATH.relative_to(ROOT)}")
    else:
        print(f" !! Graph too large for HTML ({G.number_of_nodes()} nodes)")

    # Manifest
    manifest = {}
    for f in detection.get('files', []):
        manifest[f['path']] = f.get('mtime', 0)
    MANIFEST.write_text(json.dumps(manifest, indent=2), encoding='utf-8')

    # Cost tracker
    if COST_PATH.exists():
        cost = json.loads(COST_PATH.read_text(encoding='utf-8'))
    else:
        cost = {'runs': [], 'total_input_tokens': 0, 'total_output_tokens': 0}
    cost['runs'].append({
        'date': datetime.now(timezone.utc).isoformat(),
        'nodes': G.number_of_nodes(),
        'edges': G.number_of_edges(),
        'communities': len(communities),
    })
    COST_PATH.write_text(json.dumps(cost, indent=2), encoding='utf-8')
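# cost.json grows by one entry per run (schema from the code above):
#   {"runs": [{"date": "...", "nodes": 549, "edges": 873, "communities": 28}],
#    "total_input_tokens": 0, "total_output_tokens": 0}
# The token counters stay at zero because this rebuild is purely static;
# the tokens dict above is hard-coded and no LLM calls are made here.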


# ─── Main Pipeline ───────────────────────────────────────────────────────────
def run_pipeline(skip_semantic=False):
    """Execute the full graphify pipeline."""
    start = time.time()
    print("=" * 60)
    print(f"graphify rebuild — {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)

    # Step 1: Detect
    print("\n[1/5] Detecting files...")
    files, total_words = detect_files()
    changed = get_changed_files(files)
    print(f" Found {len(files)} files ({total_words:,} words)")
    print(f" Changed since last build: {len(changed)}")

    detection = {
        'files': files,
        'total_files': len(files),
        'total_words': total_words,
        'changed_files': len(changed),
    }

    # Step 2: AST extraction
    print("\n[2/5] AST extraction...")
    ast_nodes, ast_edges = run_ast_extraction(files)

    # Step 3: Semantic extraction
    if skip_semantic:
        print("\n[3/5] Semantic extraction... SKIPPED (--quick)")
        sem_nodes, sem_edges, hyperedges = [], [], []
    else:
        print("\n[3/5] Semantic extraction...")
        sem_nodes, sem_edges, hyperedges = build_semantic_nodes()

    # Step 4: Merge & build
    print("\n[4/5] Building graph...")
    G, communities, cohesion, labels, gods, surprises, questions, extraction = \
        merge_and_build(ast_nodes, ast_edges, sem_nodes, sem_edges, hyperedges)

    # Step 5: Generate outputs
    print("\n[5/5] Generating outputs...")
    generate_outputs(G, communities, cohesion, labels, gods, surprises, questions, detection, extraction)

    elapsed = time.time() - start
    print(f"\n{'=' * 60}")
    print(f"Done in {elapsed:.1f}s")
    print(f" {G.number_of_nodes()} nodes, {G.number_of_edges()} edges, {len(communities)} communities")
    print(" Open graphify-out/graph.html in your browser")
    print(f"{'=' * 60}")


def watch_mode(skip_semantic=False):
    """Watch for file changes and rebuild automatically."""
    print("Watching for changes... (Ctrl+C to stop)")
    last_mtimes = {}

    while True:
        try:
            changed = False
            for dirpath, dirnames, filenames in os.walk(ROOT):
                dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]
                for fname in filenames:
                    fpath = Path(dirpath) / fname
                    if fpath.suffix.lower() not in CORPUS_EXTENSIONS:
                        continue
                    try:
                        mtime = fpath.stat().st_mtime
                    except OSError:
                        continue
                    key = str(fpath)
                    if key in last_mtimes and last_mtimes[key] != mtime:
                        rel = fpath.relative_to(ROOT)
                        print(f"\n Changed: {rel}")
                        changed = True
                    last_mtimes[key] = mtime

            if changed:
                # Propagate --quick so watch rebuilds match the initial run.
                run_pipeline(skip_semantic=skip_semantic)

            time.sleep(3)
        except KeyboardInterrupt:
            print("\nStopped watching.")
            break


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='NudR Knowledge Graph Rebuild')
    parser.add_argument('--watch', action='store_true', help='Watch mode: rebuild on file change')
    parser.add_argument('--quick', action='store_true', help='Quick mode: AST-only, skip semantic')
    args = parser.parse_args()

    if args.watch:
        run_pipeline(skip_semantic=args.quick)
        watch_mode(skip_semantic=args.quick)
    else:
        run_pipeline(skip_semantic=args.quick)
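# CLI usage (from the argparse setup above):
#   python graph_codebase.py            # one full rebuild
#   python graph_codebase.py --quick    # AST-only, skip the semantic layer
#   python graph_codebase.py --watch    # rebuild whenever a corpus file changes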
graphify-out/GRAPH_REPORT.md
ADDED
@@ -0,0 +1,252 @@
# Graph Report - /run/media/morpheuslord/Personal_Files/Projects/Rewriter (2026-05-03)

## Corpus Check
- 442 files · ~1,967,332 words
- Verdict: corpus is large enough that graph structure adds value.

## Summary
- 549 nodes · 873 edges · 27 communities detected
- Extraction: 76% EXTRACTED · 24% INFERRED · 0% AMBIGUOUS · INFERRED: 208 edges (avg confidence: 0.6)
- Token cost: 0 input · 0 output

## Community Hubs (Navigation)
- [[_COMMUNITY_Module Group 0|Module Group 0]]
- [[_COMMUNITY_Utility Scripts|Utility Scripts]]
- [[_COMMUNITY_Module Group 2|Module Group 2]]
- [[_COMMUNITY_Module Group 3|Module Group 3]]
- [[_COMMUNITY_Utility Scripts|Utility Scripts]]
- [[_COMMUNITY_Module Group 5|Module Group 5]]
- [[_COMMUNITY_Token Management|Token Management]]
- [[_COMMUNITY_Utility Scripts|Utility Scripts]]
- [[_COMMUNITY_Authentication|Authentication]]
- [[_COMMUNITY_Utility Scripts|Utility Scripts]]
- [[_COMMUNITY_Module Group 10|Module Group 10]]
- [[_COMMUNITY_Feed Scoring & Pool|Feed Scoring & Pool]]
- [[_COMMUNITY_Module Group 12|Module Group 12]]
- [[_COMMUNITY_Token Management|Token Management]]
- [[_COMMUNITY_Module Group 14|Module Group 14]]
- [[_COMMUNITY_Utility Scripts|Utility Scripts]]
- [[_COMMUNITY_Module Group 16|Module Group 16]]
- [[_COMMUNITY_Module Group 17|Module Group 17]]
- [[_COMMUNITY_Module Group 18|Module Group 18]]
- [[_COMMUNITY_Module Group 19|Module Group 19]]
- [[_COMMUNITY_Module Group 20|Module Group 20]]
- [[_COMMUNITY_Infrastructure (Terraform)|Infrastructure (Terraform)]]
- [[_COMMUNITY_Utility Scripts|Utility Scripts]]
- [[_COMMUNITY_Module Group 23|Module Group 23]]
- [[_COMMUNITY_Security & Rate Limiting|Security & Rate Limiting]]
- [[_COMMUNITY_WebSocket Codec|WebSocket Codec]]
- [[_COMMUNITY_Module Group 27|Module Group 27]]

## God Nodes (most connected - your core abstractions)
1. `train()` - 34 edges
2. `__init__()` - 28 edges
3. `__init__()` - 27 edges
4. `__init__()` - 27 edges
5. `__init__()` - 27 edges
6. `__init__()` - 27 edges
7. `__init__()` - 27 edges
8. `__init__()` - 27 edges
9. `correct()` - 16 edges
10. `__init__()` - 13 edges

## Surprising Connections (you probably didn't know these)
- `run_inference()` --calls--> `correct()` [INFERRED]
  scripts/run_inference.py → src/preprocessing/spell_corrector.py
- `train()` --calls--> `__init__()` [INFERRED]
  scripts/train.py → src/training/dataset.py
- `__init__()` --calls--> `__init__()` [INFERRED]
  scripts/train.py → src/training/dataset.py
- `score()` --calls--> `forward()` [INFERRED]
  src/training/human_pattern_extractor.py → scripts/train.py
- `test_spell_correction_empty()` --calls--> `correct()` [INFERRED]
  tests/test_preprocessing.py → src/inference/corrector.py

## Hyperedges (group relationships)
- **WebSocket Channel System** — sem_unified_ws, sem_feed_ws, sem_chat_ws, sem_keysync_ws, sem_discovery_ws [EXTRACTED 1.00]
- **Security Defense Stack** — sem_hmac_verification, sem_origin_secret, sem_pow_challenge, sem_rate_limiting, sem_attack_detection [EXTRACTED 1.00]
- **Feed Recommendation Pipeline** — sem_feed_pool, sem_feed_filters, sem_feed_scoring, sem_feed_heatmap, sem_feed_reciprocal, sem_feed_gradient [EXTRACTED 1.00]

## Communities

### Community 0 - "Module Group 0"
Cohesion: 0.04
Nodes (55): EntitySpan, NERTagger, Tags named entities and produces protected spans., Named Entity Recognition tagger.
Identifies entities (persons, locations, organi, get_protected_spans(), Return (start, end) char spans that must not be modified., tag(), Extract all named entities from text. (+47 more)

### Community 1 - "Utility Scripts"
Cohesion: 0.06
Nodes (38): Evaluation script.
Runs all evaluation metrics on the test set.
Run: python scri, evaluate(), Run evaluation on the specified data split., ERRANTEvaluator, Evaluates grammar correction quality using ERRANT annotations., ERRANT-based grammatical error evaluation.
Uses the ERRANT toolkit for standardi, evaluate(), Compute ERRANT precision, recall, F0.5. (+30 more)

### Community 2 - "Module Group 2"
Cohesion: 0.07
Nodes (36): StyleFingerprinter, Extracts style fingerprint vectors from text samples., StyleProjectionMLP, Projects raw feature vector to 512-dim style embedding., _avg_dep_tree_depth(), Compute average dependency tree depth across all tokens., _avg_syllables_per_word(), Average syllables per word. (+28 more)

### Community 3 - "Module Group 3"
Cohesion: 0.06
Nodes (35): AWLLoader, Loads and manages Academic Word List data., _load_synonyms(), Load academic synonym mappings from JSON., _load_word_list(), Load a word list file into a set of lowercase words., all_words(), Return the full set of academic words. (+27 more)

### Community 4 - "Utility Scripts"
Cohesion: 0.31
Nodes (34): __init__(), CEOnlyLoss, Cross-entropy only loss — the only loss that provides gradient signal., __init__(), _auto_batch_size(), Pick optimal batch size based on model size and available resources., _setup_device(), Detect GPU and configure hybrid VRAM management.

Returns (device, gpu_info) whe (+26 more)

### Community 5 - "Module Group 5"
Cohesion: 0.08
Nodes (29): DyslexiaSimulator, Generates synthetic dyslectic text from clean input for data augmentation., _double_letter(), Double a random interior letter., _omit_letter(), Remove a random interior letter., _reverse_letter(), Swap b/d, p/q style reversals. (+21 more)

### Community 6 - "Token Management"
Cohesion: 0.07
Nodes (28): Loads and wraps the base pretrained model.
Supported architectures:
- google/f, load_model_and_tokenizer(), Load a pretrained model with optional LoRA and quantization.

Args:
model_ke, apply_lora(), Apply LoRA adapters to a model and return the wrapped model., create_lora_config(), Create a LoRA configuration for the given task type., LoRA adapter configuration and management.
Wraps PEFT LoRA utilities for applyin (+20 more)

### Community 7 - "Utility Scripts"
Cohesion: 0.08
Nodes (28): Pre-trains the HumanPatternClassifier on both Kaggle datasets.
Run this BEFORE t, train_classifier(), Pre-train the human pattern classifier on Kaggle datasets., forward(), HumanPatternClassifier, Lightweight MLP trained to distinguish human from AI writing.
Input: feature vec, HumanPatternFeatureExtractor, Extracts 17-dimensional feature vector encoding human vs AI writing patterns.

O (+20 more)

### Community 8 - "Authentication"
Cohesion: 0.08
Nodes (27): AuthorshipVerifier, Verifies authorship consistency between input and output text., Authorship verification module.
Uses a fine-tuned model to verify whether the co, verify(), Return probability that both texts were written by the same author.

Uses senten, average_style_vectors(), Compute the mean style vector from a list of vectors., cosine_similarity() (+19 more)

### Community 9 - "Utility Scripts"
Cohesion: 0.08
Nodes (25): Interactive inference script.
Run: python scripts/run_inference.py --config conf, run_inference(), Run inference on text input., correct_text(), Correct dyslectic text with style preservation and academic elevation., FastAPI server for the Dyslexia Academic Writing Corrector API.
Provides RESTful, health(), Health check endpoint. (+17 more)

### Community 10 - "Module Group 10"
Cohesion: 0.1
Nodes (27): _get_call_name(), Extract callable name from ast.Call node., _get_name(), Extract name from various AST node types., _resolve_edges(), Post-process edges to resolve bare names to actual node IDs.

The per-file AST e, build_semantic_nodes(), Build semantic nodes from documentation files.
These capture high-level architec (+19 more)

### Community 11 - "Feed Scoring & Pool"
Cohesion: 0.08
Nodes (27): Chat WebSocket Channel, Discovery WebSocket Channel, E2EE X25519 Key Exchange, FastAPI Stateless Backend, Feed Hard Filters (12 Rules), 3-Tier Gradient Distribution, Preference Heatmap (Learned AI), Feed Pool Computation Pipeline (+19 more)

### Community 12 - "Module Group 12"
Cohesion: 0.12
Nodes (22): GLEU, (Note: This script computes sentence-level GLEU score.)

This script calculates , get_gleu_stats(), calculate mean and confidence interval from all GLEU iterations, get_ngram_counts(), get ngrams of order n for a tokenized sentence, get_ngram_diff(), returns ngrams in a but not in b (+14 more)

### Community 13 - "Token Management"
Cohesion: 0.16
Nodes (17): clean_para(), convert_char_to_tok(), get_all_tok_starts_and_ends(), get_paras(), get_sents(), get_token_edits(), main(), noop_edit() (+9 more)

### Community 14 - "Module Group 14"
Cohesion: 0.13
Nodes (14): FormalityClassifier, Scores text formality on a 0-1 scale using rule-based heuristics., Formality classifier module.
Classifies text on a 0-1 formality scale using ling, score(), Return formality score in [0, 1]. Higher = more formal.

Scoring based on:
- Con, RegisterFilterAdvanced, Advanced register filtering with nominalisation and hedging passes., add_hedging() (+6 more)

### Community 15 - "Utility Scripts"
Cohesion: 0.2
Nodes (14): apply_bea19_edits(), Apply BEA-2019 character-level edits to produce corrected text.

edits_block for, create_splits(), Split train.jsonl into train and val sets., Converts all raw dataset formats into unified JSONL training format.
Output sche, main(), process_bea19_json(), Process a BEA-2019 format JSON file (FCE or W&I+LOCNESS).
Each line is a JSON ob (+6 more)

### Community 16 - "Module Group 16"
Cohesion: 0.24
Nodes (9): CorrectionTrainer, Custom trainer — uses model's built-in loss directly., _strip_custom_fields(), Remove dataset fields that T5 doesn't accept., compute_loss(), Use model's built-in CE loss — avoids double-computing logits loss., Custom HuggingFace Trainer subclass.
Uses the model's built-in cross-entropy los, prediction_step() (+1 more)

### Community 17 - "Module Group 17"
Cohesion: 0.29
Nodes (5): RateLimitMiddleware, Simple in-memory rate limiting., RequestLoggingMiddleware, Logs all incoming requests with timing information., API middleware for request logging, rate limiting, and error handling.

### Community 18 - "Module Group 18"
Cohesion: 0.29
Nodes (5): EarlyStoppingOnStyleDrift, Stops training if style similarity drops below threshold., StyleMetricsCallback, Logs style similarity metrics during evaluation., Training callbacks for monitoring and checkpointing.
Integrates with Weights & B

### Community 19 - "Module Group 19"
Cohesion: 0.33
Nodes (5): EmotionClassifier, Classifies emotional register of text using keyword-based analysis., classify(), Return emotion distribution over register categories.

Returns a dict with keys:, Emotion/register classifier module.
Classifies text emotional register (neutral,

### Community 20 - "Module Group 20"
Cohesion: 0.5
Nodes (3): CorrectionRequest, CorrectionResponse, Pydantic schemas for API request/response validation.

### Community 21 - "Infrastructure (Terraform)"
Cohesion: 0.5
Nodes (4): ALB + Auto Scaling Group, AWS Secrets Manager Integration, Terraform AWS Infrastructure, VPC Network Topology

### Community 22 - "Utility Scripts"
Cohesion: 0.67
Nodes (1): Downloads all publicly available HuggingFace datasets automatically.
Datasets re

### Community 23 - "Module Group 23"
Cohesion: 0.67
Nodes (3): Cloudflare Edge Proxy, Lambda Origin Secret Rotator, X-Origin-Secret Middleware

### Community 24 - "Security & Rate Limiting"
Cohesion: 1.0
Nodes (2): Attack Detection & IP Risk Management, Per-IP Rate Limiting

### Community 26 - "WebSocket Codec"
Cohesion: 1.0
Nodes (1): HMAC-SHA256 Request Verification

### Community 27 - "Module Group 27"
Cohesion: 1.0
Nodes (1): Proof-of-Work Challenge

## Knowledge Gaps
- **259 isolated node(s):** `graphify_rebuild.py — One-shot NudR knowledge graph regeneration.

Usage:
py`, `Walk the project and return list of relevant files with metadata.`, `Compare against manifest to find changed files.`, `SHA-256 hash for cache keying.`, `Extract AST nodes and edges from a single Python file.` (+254 more)
  These have ≤1 connection - possible missing edges or undocumented components.
- **Thin community `Utility Scripts`** (3 nodes): `download_all_huggingface_datasets.py`, `Downloads all publicly available HuggingFace datasets automatically.
Datasets re`, `main()`
  Too small to be a meaningful cluster - may be noise or needs more connections extracted.
- **Thin community `Security & Rate Limiting`** (2 nodes): `Attack Detection & IP Risk Management`, `Per-IP Rate Limiting`
  Too small to be a meaningful cluster - may be noise or needs more connections extracted.
- **Thin community `WebSocket Codec`** (1 node): `HMAC-SHA256 Request Verification`
  Too small to be a meaningful cluster - may be noise or needs more connections extracted.
- **Thin community `Module Group 27`** (1 node): `Proof-of-Work Challenge`
  Too small to be a meaningful cluster - may be noise or needs more connections extracted.

## Suggested Questions
_Questions this graph is uniquely positioned to answer:_

- **Why does `parse()` connect `Token Management` to `Utility Scripts`, `Module Group 10`?**
  _High betweenness centrality (0.125) - this node is a cross-community bridge._
- **Why does `correct()` connect `Utility Scripts` to `Module Group 0`, `Utility Scripts`, `Module Group 2`, `Module Group 3`?**
  _High betweenness centrality (0.092) - this node is a cross-community bridge._
- **Why does `extract_ast_file()` connect `Module Group 10` to `Token Management`?**
  _High betweenness centrality (0.083) - this node is a cross-community bridge._
- **Are the 26 inferred relationships involving `train()` (e.g. with `__init__()` and `__init__()`) actually correct?**
  _`train()` has 26 INFERRED edges - model-reasoned connections that need verification._
- **Are the 26 inferred relationships involving `__init__()` (e.g. with `train()` and `__init__()`) actually correct?**
  _`__init__()` has 26 INFERRED edges - model-reasoned connections that need verification._
- **Are the 26 inferred relationships involving `__init__()` (e.g. with `train()` and `__init__()`) actually correct?**
  _`__init__()` has 26 INFERRED edges - model-reasoned connections that need verification._
- **Are the 26 inferred relationships involving `__init__()` (e.g. with `train()` and `__init__()`) actually correct?**
  _`__init__()` has 26 INFERRED edges - model-reasoned connections that need verification._
graphify-out/cost.json
ADDED
@@ -0,0 +1,36 @@
{
  "runs": [
    {
      "date": "2026-05-02T14:10:36.766309+00:00",
      "nodes": 527,
      "edges": 791,
      "communities": 27
    },
    {
      "date": "2026-05-02T14:38:36.641525+00:00",
      "nodes": 527,
      "edges": 791,
      "communities": 27
    },
    {
      "date": "2026-05-02T15:18:12.036397+00:00",
      "nodes": 535,
      "edges": 803,
      "communities": 26
    },
    {
      "date": "2026-05-02T15:51:26.719125+00:00",
      "nodes": 541,
      "edges": 861,
      "communities": 27
    },
    {
      "date": "2026-05-03T09:17:56.530188+00:00",
      "nodes": 549,
      "edges": 873,
      "communities": 28
    }
  ],
  "total_input_tokens": 0,
  "total_output_tokens": 0
}
graphify-out/graph.html
ADDED
The diff for this file is too large to render. See raw diff.

graphify-out/graph.json
ADDED
The diff for this file is too large to render. See raw diff.
graphify-out/manifest.json
ADDED
@@ -0,0 +1,444 @@
{
  ".gitignore": 1777701225.48157,
  ".train_stage1_done": 1777720832.7039733,
  "docker-compose.yml": 1777700239.9815593,
  "Dockerfile": 1777700223.9820845,
  "graph_codebase.py": 1777726565.2849488,
  "Plan.MD": 1777699641.0691116,
  "pyproject.toml": 1777700221.3256943,
  "README.md": 1777724083.753812,
  "requirements-dev.txt": 1777700217.251423,
  "requirements.txt": 1777711524.5703418,
  "start.sh": 1777710701.3410938,
  "todo_registry.md": 1777710900.2597346,
  "train.sh": 1777714202.448383,
  "checkpoints/human_pattern_classifier.pt": 1777729878.0041149,
  "checkpoints/best_model/adapter_config.json": 1777796825.1490068,
  "checkpoints/best_model/adapter_model.safetensors": 1777796825.1843758,
  "checkpoints/best_model/README.md": 1777796719.1975436,
  "checkpoints/best_model/special_tokens_map.json": 1777796824.250464,
  "checkpoints/best_model/spiece.model": 1777796824.2559996,
  "checkpoints/best_model/tokenizer.json": 1777796824.275346,
  "checkpoints/best_model/tokenizer_config.json": 1777796824.2501526,
  "checkpoints/best_model_merged/config.json": 1777798859.073088,
  "checkpoints/best_model_merged/generation_config.json": 1777798859.0735242,
  "checkpoints/best_model_merged/model.safetensors": 1777798860.1068735,
  "checkpoints/best_model_merged/special_tokens_map.json": 1777798860.7994945,
  "checkpoints/best_model_merged/spiece.model": 1777798860.8052647,
  "checkpoints/best_model_merged/tokenizer.json": 1777798860.8245687,
  "checkpoints/best_model_merged/tokenizer_config.json": 1777798860.799184,
  "checkpoints/checkpoint-1300/adapter_config.json": 1777795732.5574954,
  "checkpoints/checkpoint-1300/adapter_model.safetensors": 1777795732.5569153,
  "checkpoints/checkpoint-1300/optimizer.pt": 1777795732.6929996,
  "checkpoints/checkpoint-1300/README.md": 1777795723.992747,
  "checkpoints/checkpoint-1300/rng_state.pth": 1777795732.6944063,
  "checkpoints/checkpoint-1300/scheduler.pt": 1777795732.6934369,
  "checkpoints/checkpoint-1300/trainer_state.json": 1777795732.6950734,
  "checkpoints/checkpoint-1300/training_args.bin": 1777795732.5581925,
  "checkpoints/checkpoint-1500/adapter_config.json": 1777796453.407065,
  "checkpoints/checkpoint-1500/adapter_model.safetensors": 1777796453.4062335,
  "checkpoints/checkpoint-1500/optimizer.pt": 1777796453.5532222,
  "checkpoints/checkpoint-1500/README.md": 1777796441.7245266,
  "checkpoints/checkpoint-1500/rng_state.pth": 1777796453.5546432,
  "checkpoints/checkpoint-1500/scheduler.pt": 1777796453.553674,
  "checkpoints/checkpoint-1500/trainer_state.json": 1777796453.5553663,
  "checkpoints/checkpoint-1500/training_args.bin": 1777796453.4078538,
  "checkpoints/checkpoint-1515/adapter_config.json": 1777796517.9945214,
  "checkpoints/checkpoint-1515/adapter_model.safetensors": 1777796517.9887915,
  "checkpoints/checkpoint-1515/optimizer.pt": 1777796518.4291868,
  "checkpoints/checkpoint-1515/README.md": 1777796516.2816606,
  "checkpoints/checkpoint-1515/rng_state.pth": 1777796518.4307976,
  "checkpoints/checkpoint-1515/scheduler.pt": 1777796518.4297302,
  "checkpoints/checkpoint-1515/trainer_state.json": 1777796518.4315622,
  "checkpoints/checkpoint-1515/training_args.bin": 1777796518.001258,
  "configs/awl_config.yaml": 1777700189.1299732,
  "configs/inference_config.yaml": 1777799286.2174134,
  "configs/model_config.yaml": 1777700179.166181,
  "configs/training_config.yaml": 1777790468.5972416,
  "configs/training_config_fast.yaml": 1777790475.887508,
  "data/awl/academic_synonyms.json": 1777700285.7700574,
  "data/awl/coxhead_awl.txt": 1777700281.261102,
  "data/awl/domain_lexicons/humanities.txt": 1777700291.8022907,
  "data/awl/domain_lexicons/sciences.txt": 1777700297.4100578,
  "data/awl/domain_lexicons/social_sciences.txt": 1777700299.4182992,
  "data/cache/1356ff2104663316.pt": 1777790287.6153314,
  "data/cache/d6a64358c3ef403f.pt": 1777790307.475935,
  "data/processed/test.jsonl": 1777720842.027336,
  "data/processed/train.jsonl": 1777720841.9981437,
  "data/processed/val.jsonl": 1777720842.0180135,
  "data/raw/fce_v2.1.bea19.tar.gz": 1777701562.259877,
  "data/raw/wi+locness_v2.1.bea19.tar.gz": 1777701601.202943,
  "data/raw/fce/json_to_m2.py": 1593697400.0,
  "data/raw/fce/licence.txt": 1548259834.0,
  "data/raw/fce/readme.txt": 1593701121.0,
  "data/raw/fce/json/fce.dev.json": 1551887927.0,
  "data/raw/fce/json/fce.test.json": 1551887927.0,
  "data/raw/fce/json/fce.train.json": 1551887928.0,
  "data/raw/fce/m2/fce.dev.gold.bea19.m2": 1551908535.0,
  "data/raw/fce/m2/fce.test.gold.bea19.m2": 1551908549.0,
  "data/raw/fce/m2/fce.train.gold.bea19.m2": 1551908611.0,
  "data/raw/hf/gpt_wiki_intro/data-00000-of-00001.arrow": 1777704055.4466302,
  "data/raw/hf/gpt_wiki_intro/dataset_info.json": 1777704055.4477787,
  "data/raw/hf/gpt_wiki_intro/state.json": 1777704055.4473596,
  "data/raw/hf/mage/data-00000-of-00001.arrow": 1777704009.6226566,
  "data/raw/hf/mage/dataset_info.json": 1777704009.623809,
  "data/raw/hf/mage/state.json": 1777704009.6233914,
  "data/raw/hf/paws/data-00000-of-00001.arrow": 1777704298.0143042,
  "data/raw/hf/paws/dataset_info.json": 1777704298.0152135,
  "data/raw/hf/paws/state.json": 1777704298.0148978,
  "data/raw/hf/raid/data-00000-of-00025.arrow": 1777703696.3333108,
  "data/raw/hf/raid/data-00001-of-00025.arrow": 1777703698.4878266,
  "data/raw/hf/raid/data-00002-of-00025.arrow": 1777703700.7023206,
  "data/raw/hf/raid/data-00003-of-00025.arrow": 1777703712.7551422,
  "data/raw/hf/raid/data-00004-of-00025.arrow": 1777703715.8790066,
  "data/raw/hf/raid/data-00005-of-00025.arrow": 1777703727.0471604,
  "data/raw/hf/raid/data-00006-of-00025.arrow": 1777703739.229002,
  "data/raw/hf/raid/data-00007-of-00025.arrow": 1777703750.4085863,
  "data/raw/hf/raid/data-00008-of-00025.arrow": 1777703753.7418487,
  "data/raw/hf/raid/data-00009-of-00025.arrow": 1777703767.0649137,
  "data/raw/hf/raid/data-00010-of-00025.arrow": 1777703770.6492746,
  "data/raw/hf/raid/data-00011-of-00025.arrow": 1777703779.966218,
  "data/raw/hf/raid/data-00012-of-00025.arrow": 1777703782.763389,
  "data/raw/hf/raid/data-00013-of-00025.arrow": 1777703794.4995651,
  "data/raw/hf/raid/data-00014-of-00025.arrow": 1777703797.4540114,
  "data/raw/hf/raid/data-00015-of-00025.arrow": 1777703808.532667,
  "data/raw/hf/raid/data-00016-of-00025.arrow": 1777703813.8672874,
  "data/raw/hf/raid/data-00017-of-00025.arrow": 1777703827.7822654,
  "data/raw/hf/raid/data-00018-of-00025.arrow": 1777703839.699836,
  "data/raw/hf/raid/data-00019-of-00025.arrow": 1777703847.619066,
  "data/raw/hf/raid/data-00020-of-00025.arrow": 1777703850.5027363,
  "data/raw/hf/raid/data-00021-of-00025.arrow": 1777703862.0215914,
  "data/raw/hf/raid/data-00022-of-00025.arrow": 1777703872.856046,
  "data/raw/hf/raid/data-00023-of-00025.arrow": 1777703883.6765664,
  "data/raw/hf/raid/data-00024-of-00025.arrow": 1777703904.8737774,
  "data/raw/hf/raid/dataset_info.json": 1777703904.8914242,
  "data/raw/hf/raid/state.json": 1777703904.8853946,
  "data/raw/hf/wikitext103/data-00000-of-00002.arrow": 1777704280.352249,
  "data/raw/hf/wikitext103/data-00001-of-00002.arrow": 1777704282.4038906,
  "data/raw/hf/wikitext103/dataset_info.json": 1777704282.4051147,
  "data/raw/hf/wikitext103/state.json": 1777704282.4046695,
  "data/raw/hf/writing_prompts/data-00000-of-00002.arrow": 1777704198.527498,
  "data/raw/hf/writing_prompts/data-00001-of-00002.arrow": 1777704201.3078794,
  "data/raw/hf/writing_prompts/dataset_info.json": 1777704201.3090239,
  "data/raw/hf/writing_prompts/state.json": 1777704201.3085868,
|
| 124 |
+
"data/raw/jfleg/test.ref0": 1777701409.8719044,
|
| 125 |
+
"data/raw/jfleg/test.ref1": 1777701409.8726854,
|
| 126 |
+
"data/raw/jfleg/test.ref2": 1777701409.8734703,
|
| 127 |
+
"data/raw/jfleg/test.ref3": 1777701409.8742514,
|
| 128 |
+
"data/raw/jfleg/test.spellchecked.src": 1777701409.8642416,
|
| 129 |
+
"data/raw/jfleg/test.src": 1777701409.8653388,
|
| 130 |
+
"data/raw/jfleg_repo/EACLshort037.pdf": 1777701409.8443322,
|
| 131 |
+
"data/raw/jfleg_repo/README.md": 1777701409.8446841,
|
| 132 |
+
"data/raw/jfleg_repo/dev/dev.ref0": 1777701409.8457215,
|
| 133 |
+
"data/raw/jfleg_repo/dev/dev.ref1": 1777701409.846624,
|
| 134 |
+
"data/raw/jfleg_repo/dev/dev.ref2": 1777701409.8473954,
|
| 135 |
+
"data/raw/jfleg_repo/dev/dev.ref3": 1777701409.8481197,
|
| 136 |
+
"data/raw/jfleg_repo/dev/dev.spellchecked.src": 1777701409.8490207,
|
| 137 |
+
"data/raw/jfleg_repo/dev/dev.src": 1777701409.8498135,
|
| 138 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/dev.ref.m2": 1777701409.8316338,
|
| 139 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/getpostagger.sh": 1777701409.8319283,
|
| 140 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/m2converter.py": 1777701409.8322287,
|
| 141 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/README.md": 1777701409.8256564,
|
| 142 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/test.ref.m2": 1777701409.8371184,
|
| 143 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/util/assignIOB.py": 1777701409.838248,
|
| 144 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/util/edit_dist.py": 1777701409.8386652,
|
| 145 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/util/__init__.py": 1777701409.8375704,
|
| 146 |
+
"data/raw/jfleg_repo/EACL_exp/manual_eval/coded_sentences.csv": 1777701409.839717,
|
| 147 |
+
"data/raw/jfleg_repo/EACL_exp/manual_eval/README.md": 1777701409.8391445,
|
| 148 |
+
"data/raw/jfleg_repo/EACL_exp/mturk/pairwise.csv": 1777701409.8407602,
|
| 149 |
+
"data/raw/jfleg_repo/EACL_exp/mturk/sample.csv": 1777701409.8410628,
|
| 150 |
+
"data/raw/jfleg_repo/EACL_exp/mturk/template.html": 1777701409.841373,
|
| 151 |
+
"data/raw/jfleg_repo/eval/gleu.py": 1777701409.8503115,
|
| 152 |
+
"data/raw/jfleg_repo/eval/readme.md": 1777701409.8506277,
|
| 153 |
+
"data/raw/jfleg_repo/test/test.ref0": 1777701409.8520677,
|
| 154 |
+
"data/raw/jfleg_repo/test/test.ref1": 1777701409.8528638,
|
| 155 |
+
"data/raw/jfleg_repo/test/test.ref2": 1777701409.8536794,
|
| 156 |
+
"data/raw/jfleg_repo/test/test.ref3": 1777701409.8544674,
|
| 157 |
+
"data/raw/jfleg_repo/test/test.spellchecked.src": 1777701409.8553677,
|
| 158 |
+
"data/raw/jfleg_repo/test/test.src": 1777701409.8561919,
|
| 159 |
+
"data/raw/shanegerami/AI_Human.csv": 1777701568.543233,
|
| 160 |
+
"data/raw/starblasters8/data.csv": 1777703040.4595706,
|
| 161 |
+
"data/raw/starblasters8/data.parquet": 1777703067.4076133,
|
| 162 |
+
"data/raw/starblasters8/distribution.csv": 1777703067.4080453,
|
| 163 |
+
"data/raw/starblasters8/distribution.parquet": 1777703067.4084356,
|
| 164 |
+
"data/raw/starblasters8/prompts.csv": 1777703067.4240563,
|
| 165 |
+
"data/raw/starblasters8/prompts.parquet": 1777703067.4288754,
|
| 166 |
+
"data/raw/wi+locness/json_to_m2.py": 1593701174.0,
|
| 167 |
+
"data/raw/wi+locness/licence.wi.txt": 1548261267.0,
|
| 168 |
+
"data/raw/wi+locness/license.locness.txt": 1548344432.0,
|
| 169 |
+
"data/raw/wi+locness/readme.txt": 1593702230.0,
|
| 170 |
+
"data/raw/wi+locness/json/A.dev.json": 1548254108.0,
|
| 171 |
+
"data/raw/wi+locness/json/A.train.json": 1548254108.0,
|
| 172 |
+
"data/raw/wi+locness/json/B.dev.json": 1548254108.0,
|
| 173 |
+
"data/raw/wi+locness/json/B.train.json": 1548254108.0,
|
| 174 |
+
"data/raw/wi+locness/json/C.dev.json": 1548254108.0,
|
| 175 |
+
"data/raw/wi+locness/json/C.train.json": 1548254108.0,
|
| 176 |
+
"data/raw/wi+locness/json/N.dev.json": 1548255672.0,
|
| 177 |
+
"data/raw/wi+locness/m2/A.dev.gold.bea19.m2": 1551909610.0,
|
| 178 |
+
"data/raw/wi+locness/m2/A.train.gold.bea19.m2": 1551909604.0,
|
| 179 |
+
"data/raw/wi+locness/m2/ABC.train.gold.bea19.m2": 1593702095.0,
|
| 180 |
+
"data/raw/wi+locness/m2/ABCN.dev.gold.bea19.m2": 1551909944.0,
|
| 181 |
+
"data/raw/wi+locness/m2/B.dev.gold.bea19.m2": 1551909651.0,
|
| 182 |
+
"data/raw/wi+locness/m2/B.train.gold.bea19.m2": 1551909644.0,
|
| 183 |
+
"data/raw/wi+locness/m2/C.dev.gold.bea19.m2": 1551909684.0,
|
| 184 |
+
"data/raw/wi+locness/m2/C.train.gold.bea19.m2": 1551909678.0,
|
| 185 |
+
"data/raw/wi+locness/m2/N.dev.gold.bea19.m2": 1551909694.0,
|
| 186 |
+
"data/raw/wi+locness/test/ABCN.test.bea19.orig": 1593701979.0,
|
| 187 |
+
"data/raw/wi+locness/test/readme.txt": 1593702932.0,
|
| 188 |
+
"logs/events.out.tfevents.1777733169.bazzite.202618.0": 1777733169.3767228,
|
| 189 |
+
"logs/events.out.tfevents.1777733440.bazzite.206325.0": 1777733440.2441843,
|
| 190 |
+
"logs/events.out.tfevents.1777733727.bazzite.207730.0": 1777733727.503944,
|
| 191 |
+
"logs/events.out.tfevents.1777734559.bazzite.211747.0": 1777734559.4917176,
|
| 192 |
+
"logs/events.out.tfevents.1777735849.bazzite.215021.0": 1777735849.6431587,
|
| 193 |
+
"logs/events.out.tfevents.1777737794.bazzite.222265.0": 1777737794.4041593,
|
| 194 |
+
"logs/events.out.tfevents.1777738485.bazzite.226596.0": 1777738485.9317763,
|
| 195 |
+
"logs/events.out.tfevents.1777785111.bazzite.5847.0": 1777788329.172026,
|
| 196 |
+
"logs/events.out.tfevents.1777790308.bazzite.14979.0": 1777790308.4039745,
|
| 197 |
+
"logs/events.out.tfevents.1777790432.bazzite.18166.0": 1777790432.2569437,
|
| 198 |
+
"logs/events.out.tfevents.1777790600.bazzite.19895.0": 1777790600.9711528,
|
| 199 |
+
"logs/events.out.tfevents.1777790916.bazzite.22954.0": 1777791352.7881691,
|
| 200 |
+
"logs/events.out.tfevents.1777791700.bazzite.29722.0": 1777792139.67899,
|
| 201 |
+
"logs/events.out.tfevents.1777792299.bazzite.34388.0": 1777796441.5121546,
|
| 202 |
+
"scripts/download_all_huggingface_datasets.py": 1777702146.005388,
|
| 203 |
+
"scripts/download_datasets.sh": 1777700679.976215,
|
| 204 |
+
"scripts/download_kaggle_datasets.sh": 1777700695.699875,
|
| 205 |
+
"scripts/evaluate.py": 1777710622.2847967,
|
| 206 |
+
"scripts/preprocess_data.py": 1777701728.828645,
|
| 207 |
+
"scripts/pretrain_human_pattern_classifier.py": 1777710565.377371,
|
| 208 |
+
"scripts/run_inference.py": 1777710636.728075,
|
| 209 |
+
"scripts/train.py": 1777796693.4284217,
|
| 210 |
+
"src/__init__.py": 1777700367.1651394,
|
| 211 |
+
"src/api/main.py": 1777710501.3492658,
|
| 212 |
+
"src/api/middleware.py": 1777710502.144811,
|
| 213 |
+
"src/api/schemas.py": 1777700655.5228736,
|
| 214 |
+
"src/api/__init__.py": 1777700367.176363,
|
| 215 |
+
"src/evaluation/authorship_verifier.py": 1777710422.882881,
|
| 216 |
+
"src/evaluation/errant_evaluator.py": 1777710414.1773353,
|
| 217 |
+
"src/evaluation/gleu_scorer.py": 1777710402.1214068,
|
| 218 |
+
"src/evaluation/style_metrics.py": 1777710421.8995192,
|
| 219 |
+
"src/evaluation/__init__.py": 1777700367.1744816,
|
| 220 |
+
"src/inference/corrector.py": 1777799272.1892536,
|
| 221 |
+
"src/inference/postprocessor.py": 1777799529.931668,
|
| 222 |
+
"src/inference/__init__.py": 1777700367.1754317,
|
| 223 |
+
"src/model/base_model.py": 1777789062.6184208,
|
| 224 |
+
"src/model/generation_utils.py": 1777710219.7970757,
|
| 225 |
+
"src/model/lora_adapter.py": 1777710206.3699143,
|
| 226 |
+
"src/model/style_conditioner.py": 1777789195.3776248,
|
| 227 |
+
"src/model/__init__.py": 1777700367.1716762,
|
| 228 |
+
"src/preprocessing/dependency_parser.py": 1777709958.1169899,
|
| 229 |
+
"src/preprocessing/dyslexia_simulator.py": 1777709998.6640317,
|
| 230 |
+
"src/preprocessing/ner_tagger.py": 1777709980.1368325,
|
| 231 |
+
"src/preprocessing/pipeline.py": 1777710000.6269286,
|
| 232 |
+
"src/preprocessing/sentence_segmenter.py": 1777709951.8658924,
|
| 233 |
+
"src/preprocessing/spell_corrector.py": 1777710998.2651775,
|
| 234 |
+
"src/preprocessing/__init__.py": 1777700367.1695316,
|
| 235 |
+
"src/style/emotion_classifier.py": 1777710084.9253688,
|
| 236 |
+
"src/style/fingerprinter.py": 1777733588.7603915,
|
| 237 |
+
"src/style/formality_classifier.py": 1777710041.056987,
|
| 238 |
+
"src/style/style_vector.py": 1777710029.7282178,
|
| 239 |
+
"src/style/__init__.py": 1777700367.1707523,
|
| 240 |
+
"src/training/callbacks.py": 1777710375.39277,
|
| 241 |
+
"src/training/dataset.py": 1777736946.1787465,
|
| 242 |
+
"src/training/human_pattern_extractor.py": 1777721296.1845315,
|
| 243 |
+
"src/training/loss_functions.py": 1777734093.3399415,
|
| 244 |
+
"src/training/trainer.py": 1777792224.759529,
|
| 245 |
+
"src/training/__init__.py": 1777700367.172702,
|
| 246 |
+
"src/vocabulary/awl_loader.py": 1777710137.5959558,
|
| 247 |
+
"src/vocabulary/lexical_substitution.py": 1777799073.7536068,
|
| 248 |
+
"src/vocabulary/register_filter.py": 1777711030.1810205,
|
| 249 |
+
"src/vocabulary/__init__.py": 1777700367.1736517,
|
| 250 |
+
"tests/test_evaluation.py": 1777710754.1602647,
|
| 251 |
+
"tests/test_model.py": 1777710746.0170994,
|
| 252 |
+
"tests/test_preprocessing.py": 1777710730.7286103,
|
| 253 |
+
"tests/test_style.py": 1777710738.944049,
|
| 254 |
+
"tests/test_vocabulary.py": 1777710752.9497588,
|
| 255 |
+
"wandb/debug-internal.log": 1777796523.944181,
|
| 256 |
+
"wandb/debug.log": 1777796521.577159,
|
| 257 |
+
"wandb/run-20260502_150043-2fg22e6p/run-2fg22e6p.wandb": 1777720317.7069192,
|
| 258 |
+
"wandb/run-20260502_150043-2fg22e6p/files/config.yaml": 1777720313.7898095,
|
| 259 |
+
"wandb/run-20260502_150043-2fg22e6p/files/output.log": 1777720313.775867,
|
| 260 |
+
"wandb/run-20260502_150043-2fg22e6p/files/requirements.txt": 1777714246.1567795,
|
| 261 |
+
"wandb/run-20260502_150043-2fg22e6p/files/wandb-metadata.json": 1777714246.3533409,
|
| 262 |
+
"wandb/run-20260502_150043-2fg22e6p/files/wandb-summary.json": 1777720313.7819676,
|
| 263 |
+
"wandb/run-20260502_150043-2fg22e6p/logs/debug-core.log": 1777720317.7154906,
|
| 264 |
+
"wandb/run-20260502_150043-2fg22e6p/logs/debug-internal.log": 1777720317.7080636,
|
| 265 |
+
"wandb/run-20260502_150043-2fg22e6p/logs/debug.log": 1777720313.7399838,
|
| 266 |
+
"wandb/run-20260502_165105-pwnhqrrf/run-pwnhqrrf.wandb": 1777720942.0482328,
|
| 267 |
+
"wandb/run-20260502_165105-pwnhqrrf/files/config.yaml": 1777720940.353539,
|
| 268 |
+
"wandb/run-20260502_165105-pwnhqrrf/files/output.log": 1777720940.3449767,
|
| 269 |
+
"wandb/run-20260502_165105-pwnhqrrf/files/requirements.txt": 1777720873.2094295,
|
| 270 |
+
"wandb/run-20260502_165105-pwnhqrrf/files/wandb-metadata.json": 1777720871.3923895,
|
| 271 |
+
"wandb/run-20260502_165105-pwnhqrrf/files/wandb-summary.json": 1777720940.3480256,
|
| 272 |
+
"wandb/run-20260502_165105-pwnhqrrf/logs/debug-core.log": 1777720942.0548975,
|
| 273 |
+
"wandb/run-20260502_165105-pwnhqrrf/logs/debug-internal.log": 1777720942.049499,
|
| 274 |
+
"wandb/run-20260502_165105-pwnhqrrf/logs/debug.log": 1777720940.2928586,
|
| 275 |
+
"wandb/run-20260502_165541-4d797dih/run-4d797dih.wandb": 1777721193.2341652,
|
| 276 |
+
"wandb/run-20260502_165541-4d797dih/files/config.yaml": 1777721190.8295085,
|
| 277 |
+
"wandb/run-20260502_165541-4d797dih/files/output.log": 1777721190.8289042,
|
| 278 |
+
"wandb/run-20260502_165541-4d797dih/files/requirements.txt": 1777721148.3496826,
|
| 279 |
+
"wandb/run-20260502_165541-4d797dih/files/wandb-metadata.json": 1777721146.848307,
|
| 280 |
+
"wandb/run-20260502_165541-4d797dih/files/wandb-summary.json": 1777721190.829134,
|
| 281 |
+
"wandb/run-20260502_165541-4d797dih/logs/debug-core.log": 1777721193.237868,
|
| 282 |
+
"wandb/run-20260502_165541-4d797dih/logs/debug-internal.log": 1777721193.2342758,
|
| 283 |
+
"wandb/run-20260502_165541-4d797dih/logs/debug.log": 1777721190.8276412,
|
| 284 |
+
"wandb/run-20260502_165926-36ppiwlg/run-36ppiwlg.wandb": 1777729880.6180944,
|
| 285 |
+
"wandb/run-20260502_165926-36ppiwlg/files/config.yaml": 1777729878.7727947,
|
| 286 |
+
"wandb/run-20260502_165926-36ppiwlg/files/output.log": 1777729878.0086646,
|
| 287 |
+
"wandb/run-20260502_165926-36ppiwlg/files/requirements.txt": 1777721373.2203636,
|
| 288 |
+
"wandb/run-20260502_165926-36ppiwlg/files/wandb-metadata.json": 1777721371.7235086,
|
| 289 |
+
"wandb/run-20260502_165926-36ppiwlg/files/wandb-summary.json": 1777729878.0101912,
|
| 290 |
+
"wandb/run-20260502_165926-36ppiwlg/logs/debug-core.log": 1777729880.6459498,
|
| 291 |
+
"wandb/run-20260502_165926-36ppiwlg/logs/debug-internal.log": 1777729880.61819,
|
| 292 |
+
"wandb/run-20260502_165926-36ppiwlg/logs/debug.log": 1777729880.617355,
|
| 293 |
+
"wandb/run-20260502_192151-h1jq4pkw/run-h1jq4pkw.wandb": 1777731875.1844988,
|
| 294 |
+
"wandb/run-20260502_192151-h1jq4pkw/files/config.yaml": 1777731873.207762,
|
| 295 |
+
"wandb/run-20260502_192151-h1jq4pkw/files/output.log": 1777731873.1929135,
|
| 296 |
+
"wandb/run-20260502_192151-h1jq4pkw/files/requirements.txt": 1777729913.3605578,
|
| 297 |
+
"wandb/run-20260502_192151-h1jq4pkw/files/wandb-metadata.json": 1777729913.5862672,
|
| 298 |
+
"wandb/run-20260502_192151-h1jq4pkw/files/wandb-summary.json": 1777731873.1960742,
|
| 299 |
+
"wandb/run-20260502_192151-h1jq4pkw/logs/debug-core.log": 1777731875.1935635,
|
| 300 |
+
"wandb/run-20260502_192151-h1jq4pkw/logs/debug-internal.log": 1777731875.1859224,
|
| 301 |
+
"wandb/run-20260502_192151-h1jq4pkw/logs/debug.log": 1777731873.1659987,
|
| 302 |
+
"wandb/run-20260502_200514-kl2gg5g9/run-kl2gg5g9.wandb": 1777733212.1297417,
|
| 303 |
+
"wandb/run-20260502_200514-kl2gg5g9/files/config.yaml": 1777733209.4133239,
|
| 304 |
+
"wandb/run-20260502_200514-kl2gg5g9/files/output.log": 1777733209.4107795,
|
| 305 |
+
"wandb/run-20260502_200514-kl2gg5g9/files/requirements.txt": 1777732516.9594064,
|
| 306 |
+
"wandb/run-20260502_200514-kl2gg5g9/files/wandb-metadata.json": 1777732517.0559525,
|
| 307 |
+
"wandb/run-20260502_200514-kl2gg5g9/files/wandb-summary.json": 1777733209.411088,
|
| 308 |
+
"wandb/run-20260502_200514-kl2gg5g9/logs/debug-core.log": 1777733212.141411,
|
| 309 |
+
"wandb/run-20260502_200514-kl2gg5g9/logs/debug-internal.log": 1777733212.1310723,
|
| 310 |
+
"wandb/run-20260502_200514-kl2gg5g9/logs/debug.log": 1777733209.404857,
|
| 311 |
+
"wandb/run-20260502_201947-ngpyijum/run-ngpyijum.wandb": 1777733478.2693913,
|
| 312 |
+
"wandb/run-20260502_201947-ngpyijum/files/config.yaml": 1777733476.6917355,
|
| 313 |
+
"wandb/run-20260502_201947-ngpyijum/files/output.log": 1777733476.6534271,
|
| 314 |
+
"wandb/run-20260502_201947-ngpyijum/files/requirements.txt": 1777733389.9631994,
|
| 315 |
+
"wandb/run-20260502_201947-ngpyijum/files/wandb-metadata.json": 1777733390.1321378,
|
| 316 |
+
"wandb/run-20260502_201947-ngpyijum/files/wandb-summary.json": 1777733476.656282,
|
| 317 |
+
"wandb/run-20260502_201947-ngpyijum/logs/debug-core.log": 1777733478.2785654,
|
| 318 |
+
"wandb/run-20260502_201947-ngpyijum/logs/debug-internal.log": 1777733478.2707117,
|
| 319 |
+
"wandb/run-20260502_201947-ngpyijum/logs/debug.log": 1777733476.5978289,
|
| 320 |
+
"wandb/run-20260502_202439-7n7pnref/run-7n7pnref.wandb": 1777733793.9917243,
|
| 321 |
+
"wandb/run-20260502_202439-7n7pnref/files/config.yaml": 1777733792.4110804,
|
| 322 |
+
"wandb/run-20260502_202439-7n7pnref/files/output.log": 1777733792.3750265,
|
| 323 |
+
"wandb/run-20260502_202439-7n7pnref/files/requirements.txt": 1777733681.1639447,
|
| 324 |
+
"wandb/run-20260502_202439-7n7pnref/files/wandb-metadata.json": 1777733681.322012,
|
| 325 |
+
"wandb/run-20260502_202439-7n7pnref/files/wandb-summary.json": 1777733792.378697,
|
| 326 |
+
"wandb/run-20260502_202439-7n7pnref/logs/debug-core.log": 1777733793.9979222,
|
| 327 |
+
"wandb/run-20260502_202439-7n7pnref/logs/debug-internal.log": 1777733793.9930737,
|
| 328 |
+
"wandb/run-20260502_202439-7n7pnref/logs/debug.log": 1777733792.3520947,
|
| 329 |
+
"wandb/run-20260502_203519-fib23yhh/run-fib23yhh.wandb": 1777734591.343925,
|
| 330 |
+
"wandb/run-20260502_203519-fib23yhh/files/output.log": 1777734994.8215227,
|
| 331 |
+
"wandb/run-20260502_203519-fib23yhh/files/requirements.txt": 1777734321.4982467,
|
| 332 |
+
"wandb/run-20260502_203519-fib23yhh/files/wandb-metadata.json": 1777734321.6402895,
|
| 333 |
+
"wandb/run-20260502_203519-fib23yhh/logs/debug-core.log": 1777735036.470297,
|
| 334 |
+
"wandb/run-20260502_203519-fib23yhh/logs/debug-internal.log": 1777735026.9774451,
|
| 335 |
+
"wandb/run-20260502_203519-fib23yhh/logs/debug.log": 1777734559.486897,
|
| 336 |
+
"wandb/run-20260502_204834-03roqvb7/run-03roqvb7.wandb": 1777735857.4507105,
|
| 337 |
+
"wandb/run-20260502_204834-03roqvb7/files/config.yaml": 1777735855.5278394,
|
| 338 |
+
"wandb/run-20260502_204834-03roqvb7/files/output.log": 1777735854.776806,
|
| 339 |
+
"wandb/run-20260502_204834-03roqvb7/files/requirements.txt": 1777735116.9440887,
|
| 340 |
+
"wandb/run-20260502_204834-03roqvb7/files/wandb-metadata.json": 1777735117.0886073,
|
| 341 |
+
"wandb/run-20260502_204834-03roqvb7/files/wandb-summary.json": 1777735854.7797687,
|
| 342 |
+
"wandb/run-20260502_204834-03roqvb7/logs/debug-core.log": 1777735857.4598973,
|
| 343 |
+
"wandb/run-20260502_204834-03roqvb7/logs/debug-internal.log": 1777735857.451936,
|
| 344 |
+
"wandb/run-20260502_204834-03roqvb7/logs/debug.log": 1777735854.702104,
|
| 345 |
+
"wandb/run-20260502_210534-j0t4q38m/run-j0t4q38m.wandb": 1777736782.127596,
|
| 346 |
+
"wandb/run-20260502_210534-j0t4q38m/files/config.yaml": 1777736780.0784059,
|
| 347 |
+
"wandb/run-20260502_210534-j0t4q38m/files/output.log": 1777736780.0776114,
|
| 348 |
+
"wandb/run-20260502_210534-j0t4q38m/files/requirements.txt": 1777736140.9308562,
|
| 349 |
+
"wandb/run-20260502_210534-j0t4q38m/files/wandb-metadata.json": 1777736139.3660376,
|
| 350 |
+
"wandb/run-20260502_210534-j0t4q38m/files/wandb-summary.json": 1777736780.0778146,
|
| 351 |
+
"wandb/run-20260502_210534-j0t4q38m/logs/debug-core.log": 1777736782.1309361,
|
| 352 |
+
"wandb/run-20260502_210534-j0t4q38m/logs/debug-internal.log": 1777736782.1277256,
|
| 353 |
+
"wandb/run-20260502_210534-j0t4q38m/logs/debug.log": 1777736780.076756,
|
| 354 |
+
"wandb/run-20260502_212127-vl8pftkj/run-vl8pftkj.wandb": 1777737801.436665,
|
| 355 |
+
"wandb/run-20260502_212127-vl8pftkj/files/config.yaml": 1777737799.6158743,
|
| 356 |
+
"wandb/run-20260502_212127-vl8pftkj/files/output.log": 1777737798.8592515,
|
| 357 |
+
"wandb/run-20260502_212127-vl8pftkj/files/requirements.txt": 1777737089.2927256,
|
| 358 |
+
"wandb/run-20260502_212127-vl8pftkj/files/wandb-metadata.json": 1777737089.4481473,
|
| 359 |
+
"wandb/run-20260502_212127-vl8pftkj/files/wandb-summary.json": 1777737798.8655431,
|
| 360 |
+
"wandb/run-20260502_212127-vl8pftkj/logs/debug-core.log": 1777737801.4842963,
|
| 361 |
+
"wandb/run-20260502_212127-vl8pftkj/logs/debug-internal.log": 1777737801.4381168,
|
| 362 |
+
"wandb/run-20260502_212127-vl8pftkj/logs/debug.log": 1777737798.7922306,
|
| 363 |
+
"wandb/run-20260502_213822-mmm9bdu9/run-mmm9bdu9.wandb": 1777738718.0236964,
|
| 364 |
+
"wandb/run-20260502_213822-mmm9bdu9/files/config.yaml": 1777738715.2277923,
|
| 365 |
+
"wandb/run-20260502_213822-mmm9bdu9/files/output.log": 1777738715.1982114,
|
| 366 |
+
"wandb/run-20260502_213822-mmm9bdu9/files/requirements.txt": 1777738106.5846484,
|
| 367 |
+
"wandb/run-20260502_213822-mmm9bdu9/files/wandb-metadata.json": 1777738105.154441,
|
| 368 |
+
"wandb/run-20260502_213822-mmm9bdu9/files/wandb-summary.json": 1777738715.2006595,
|
| 369 |
+
"wandb/run-20260502_213822-mmm9bdu9/logs/debug-core.log": 1777738718.0312166,
|
| 370 |
+
"wandb/run-20260502_213822-mmm9bdu9/logs/debug-internal.log": 1777738718.0251148,
|
| 371 |
+
"wandb/run-20260502_213822-mmm9bdu9/logs/debug.log": 1777738715.1964543,
|
| 372 |
+
"wandb/run-20260503_104137-zjr4w5ln/run-zjr4w5ln.wandb": 1777789114.0511775,
|
| 373 |
+
"wandb/run-20260503_104137-zjr4w5ln/files/config.yaml": 1777789229.2851222,
|
| 374 |
+
"wandb/run-20260503_104137-zjr4w5ln/files/output.log": 1777789229.2830012,
|
| 375 |
+
"wandb/run-20260503_104137-zjr4w5ln/files/requirements.txt": 1777785104.199556,
|
| 376 |
+
"wandb/run-20260503_104137-zjr4w5ln/files/wandb-metadata.json": 1777785102.3896415,
|
| 377 |
+
"wandb/run-20260503_104137-zjr4w5ln/files/wandb-summary.json": 1777789229.283297,
|
| 378 |
+
"wandb/run-20260503_104137-zjr4w5ln/logs/debug-core.log": 1777789229.6806114,
|
| 379 |
+
"wandb/run-20260503_104137-zjr4w5ln/logs/debug-internal.log": 1777789229.2004015,
|
| 380 |
+
"wandb/run-20260503_104137-zjr4w5ln/logs/debug.log": 1777789229.5057423,
|
| 381 |
+
"wandb/run-20260503_120130-xzkygl93/run-xzkygl93.wandb": 1777789897.810659,
|
| 382 |
+
"wandb/run-20260503_120130-xzkygl93/files/config.yaml": 1777789896.1235933,
|
| 383 |
+
"wandb/run-20260503_120130-xzkygl93/files/output.log": 1777789895.7228522,
|
| 384 |
+
"wandb/run-20260503_120130-xzkygl93/files/requirements.txt": 1777789895.719956,
|
| 385 |
+
"wandb/run-20260503_120130-xzkygl93/files/wandb-metadata.json": 1777789895.5577607,
|
| 386 |
+
"wandb/run-20260503_120130-xzkygl93/files/wandb-summary.json": 1777789895.7230475,
|
| 387 |
+
"wandb/run-20260503_120130-xzkygl93/logs/debug-core.log": 1777789897.8687205,
|
| 388 |
+
"wandb/run-20260503_120130-xzkygl93/logs/debug-internal.log": 1777789897.810803,
|
| 389 |
+
"wandb/run-20260503_120130-xzkygl93/logs/debug.log": 1777789895.7187955,
|
| 390 |
+
"wandb/run-20260503_120403-cbb6slr5/run-cbb6slr5.wandb": 1777790315.1058269,
|
| 391 |
+
"wandb/run-20260503_120403-cbb6slr5/files/config.yaml": 1777790312.1412077,
|
| 392 |
+
"wandb/run-20260503_120403-cbb6slr5/files/output.log": 1777790311.667267,
|
| 393 |
+
"wandb/run-20260503_120403-cbb6slr5/files/requirements.txt": 1777790045.13399,
|
| 394 |
+
"wandb/run-20260503_120403-cbb6slr5/files/wandb-metadata.json": 1777790045.282678,
|
| 395 |
+
"wandb/run-20260503_120403-cbb6slr5/files/wandb-summary.json": 1777790311.6676607,
|
| 396 |
+
"wandb/run-20260503_120403-cbb6slr5/logs/debug-core.log": 1777790315.1408749,
|
| 397 |
+
"wandb/run-20260503_120403-cbb6slr5/logs/debug-internal.log": 1777790315.1073751,
|
| 398 |
+
"wandb/run-20260503_120403-cbb6slr5/logs/debug.log": 1777790311.6647966,
|
| 399 |
+
"wandb/run-20260503_121016-impcgg4z/run-impcgg4z.wandb": 1777790434.7027235,
|
| 400 |
+
"wandb/run-20260503_121016-impcgg4z/files/config.yaml": 1777790433.1677222,
|
| 401 |
+
"wandb/run-20260503_121016-impcgg4z/files/output.log": 1777790432.6490448,
|
| 402 |
+
"wandb/run-20260503_121016-impcgg4z/files/requirements.txt": 1777790418.176722,
|
| 403 |
+
"wandb/run-20260503_121016-impcgg4z/files/wandb-metadata.json": 1777790418.3191545,
|
| 404 |
+
"wandb/run-20260503_121016-impcgg4z/files/wandb-summary.json": 1777790432.6492898,
|
| 405 |
+
"wandb/run-20260503_121016-impcgg4z/logs/debug-core.log": 1777790434.707022,
|
| 406 |
+
"wandb/run-20260503_121016-impcgg4z/logs/debug-internal.log": 1777790434.7028248,
|
| 407 |
+
"wandb/run-20260503_121016-impcgg4z/logs/debug.log": 1777790434.7233517,
|
| 408 |
+
"wandb/run-20260503_121312-l9gn41e7/run-l9gn41e7.wandb": 1777790731.9692764,
|
| 409 |
+
"wandb/run-20260503_121312-l9gn41e7/files/config.yaml": 1777790729.3662996,
|
| 410 |
+
"wandb/run-20260503_121312-l9gn41e7/files/output.log": 1777790729.3632636,
|
| 411 |
+
"wandb/run-20260503_121312-l9gn41e7/files/requirements.txt": 1777790594.305007,
|
| 412 |
+
"wandb/run-20260503_121312-l9gn41e7/files/wandb-metadata.json": 1777790594.4471908,
|
| 413 |
+
"wandb/run-20260503_121312-l9gn41e7/files/wandb-summary.json": 1777790729.363634,
|
| 414 |
+
"wandb/run-20260503_121312-l9gn41e7/logs/debug-core.log": 1777790731.9724958,
|
| 415 |
+
"wandb/run-20260503_121312-l9gn41e7/logs/debug-internal.log": 1777790731.9694047,
|
| 416 |
+
"wandb/run-20260503_121312-l9gn41e7/logs/debug.log": 1777790729.3612194,
|
| 417 |
+
"wandb/run-20260503_121828-7pvaltt8/run-7pvaltt8.wandb": 1777791356.9240425,
|
| 418 |
+
"wandb/run-20260503_121828-7pvaltt8/files/config.yaml": 1777791353.5784223,
|
| 419 |
+
"wandb/run-20260503_121828-7pvaltt8/files/output.log": 1777791353.5761926,
|
| 420 |
+
"wandb/run-20260503_121828-7pvaltt8/files/requirements.txt": 1777790910.2114842,
|
| 421 |
+
"wandb/run-20260503_121828-7pvaltt8/files/wandb-metadata.json": 1777790910.3638337,
|
| 422 |
+
"wandb/run-20260503_121828-7pvaltt8/files/wandb-summary.json": 1777791353.5765028,
|
| 423 |
+
"wandb/run-20260503_121828-7pvaltt8/logs/debug-core.log": 1777791356.92785,
|
| 424 |
+
"wandb/run-20260503_121828-7pvaltt8/logs/debug-internal.log": 1777791356.9241953,
|
| 425 |
+
"wandb/run-20260503_121828-7pvaltt8/logs/debug.log": 1777791353.573493,
|
| 426 |
+
"wandb/run-20260503_123131-4y9tqaim/run-4y9tqaim.wandb": 1777792146.311411,
|
| 427 |
+
"wandb/run-20260503_123131-4y9tqaim/files/config.yaml": 1777792144.4709916,
|
| 428 |
+
"wandb/run-20260503_123131-4y9tqaim/files/output.log": 1777792143.9418323,
|
| 429 |
+
"wandb/run-20260503_123131-4y9tqaim/files/requirements.txt": 1777791693.6865625,
|
| 430 |
+
"wandb/run-20260503_123131-4y9tqaim/files/wandb-metadata.json": 1777791693.8298368,
|
| 431 |
+
"wandb/run-20260503_123131-4y9tqaim/files/wandb-summary.json": 1777792143.9447422,
|
| 432 |
+
"wandb/run-20260503_123131-4y9tqaim/logs/debug-core.log": 1777792146.3145404,
|
| 433 |
+
"wandb/run-20260503_123131-4y9tqaim/logs/debug-internal.log": 1777792146.3116014,
|
| 434 |
+
"wandb/run-20260503_123131-4y9tqaim/logs/debug.log": 1777792143.9181907,
|
| 435 |
+
"wandb/run-20260503_124131-7q4dwe22/run-7q4dwe22.wandb": 1777796523.9426115,
|
| 436 |
+
"wandb/run-20260503_124131-7q4dwe22/files/config.yaml": 1777796521.6113658,
|
| 437 |
+
"wandb/run-20260503_124131-7q4dwe22/files/output.log": 1777796520.1061456,
|
| 438 |
+
"wandb/run-20260503_124131-7q4dwe22/files/requirements.txt": 1777792293.6231525,
|
| 439 |
+
"wandb/run-20260503_124131-7q4dwe22/files/wandb-metadata.json": 1777792293.7842615,
|
| 440 |
+
"wandb/run-20260503_124131-7q4dwe22/files/wandb-summary.json": 1777796521.5802517,
|
| 441 |
+
"wandb/run-20260503_124131-7q4dwe22/logs/debug-core.log": 1777796524.016464,
|
| 442 |
+
"wandb/run-20260503_124131-7q4dwe22/logs/debug-internal.log": 1777796523.944181,
|
| 443 |
+
"wandb/run-20260503_124131-7q4dwe22/logs/debug.log": 1777796521.577159
|
| 444 |
+
}
|
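The block above closes the manifest (graphify-out/manifest.json): a flat JSON object mapping every tracked file path to its last-modified Unix timestamp in fractional seconds. A minimal sketch of consuming it, assuming that path and a hypothetical helper name files_changed_since:

import json
from datetime import datetime, timezone

def files_changed_since(manifest_path: str, cutoff_ts: float) -> list[str]:
    """Return manifest paths whose recorded mtime is newer than cutoff_ts."""
    with open(manifest_path) as f:
        manifest = json.load(f)  # {"path": unix_mtime, ...}
    return sorted(path for path, ts in manifest.items() if ts > cutoff_ts)

# Example: list everything touched after 2026-05-03 00:00 UTC.
cutoff = datetime(2026, 5, 3, tzinfo=timezone.utc).timestamp()
for path in files_changed_since("graphify-out/manifest.json", cutoff):
    print(path)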
pyproject.toml
ADDED
@@ -0,0 +1,42 @@
[build-system]
requires = ["setuptools>=68.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "dyslexia-writing-ai"
version = "1.0.0"
description = "Style-preserving, grammar-correcting, academic vocabulary elevating AI model for dyslectic writing"
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.10"
authors = [
    {name = "morpheuslord"},
]

[project.scripts]
train = "scripts.train:train"

[tool.setuptools.packages.find]
include = ["src*"]

[tool.black]
line-length = 120
target-version = ["py310"]

[tool.ruff]
line-length = 120
target-version = "py310"

[tool.ruff.lint]
select = ["E", "F", "W", "I", "N", "UP"]

[tool.mypy]
python_version = "3.10"
warn_return_any = true
warn_unused_configs = true
ignore_missing_imports = true

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
addopts = "-v --tb=short"
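A note on the [project.scripts] table: after an editable install (pip install -e .), the train console command resolves to the train() function in scripts/train.py. Since [tool.setuptools.packages.find] only includes src*, the scripts package is not part of the installed distribution, so depending on the install mode the entry point may fail to import; running python scripts/train.py from the repository root side-steps the packaging question.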
requirements-dev.txt
ADDED
@@ -0,0 +1,9 @@
pytest==8.1.1
pytest-asyncio==0.23.6
pytest-cov==5.0.0
black==24.4.0
ruff==0.4.1
mypy==1.9.0
pre-commit==3.7.0
ipykernel==6.29.4
jupyter==1.0.0
requirements.txt
ADDED
@@ -0,0 +1,59 @@
# ── Core ML & Deep Learning ──────────────────────────────────────────────────
torch>=2.9.0
torchvision>=0.20.0
torchaudio>=2.9.0
transformers>=4.40.0
datasets>=2.18.0
accelerate>=0.29.0
peft>=0.10.0              # LoRA / parameter-efficient fine-tuning
bitsandbytes>=0.43.0      # 8-bit & 4-bit quantization
sentencepiece>=0.2.0      # T5 tokenizer dependency
protobuf>=4.25.0          # T5 tokenizer dependency

# ── Sentence Embeddings ───────────────────────────────────────────────────────
sentence-transformers>=2.6.0
faiss-cpu>=1.8.0          # Vector similarity search

# ── NLP Pre-Processing ────────────────────────────────────────────────────────
spacy>=3.7.0
spacy-transformers>=1.3.0
language-tool-python>=2.7.0   # LanguageTool grammar checker
pyspellchecker>=0.8.0         # Context-free spell check (pre-pass)
nltk>=3.8.0
textstat>=0.7.0               # Readability scores (Flesch-Kincaid, etc.)

# ── Lexical Substitution ─────────────────────────────────────────────────────
wordfreq>=3.1.0               # Word frequency data

# ── Training Infrastructure ───────────────────────────────────────────────────
wandb>=0.16.0                 # Experiment tracking
tensorboard>=2.16.0
numpy>=1.26.0
pandas>=2.2.0
scikit-learn>=1.4.0
scipy>=1.13.0

# ── Evaluation Tools ──────────────────────────────────────────────────────────
errant>=2.3.0                 # Grammar Error Annotation Toolkit
sacrebleu>=2.4.0              # BLEU/GLEU scoring
bert-score>=0.3.13            # Semantic similarity scoring
rouge-score>=0.1.2

# ── API Server ────────────────────────────────────────────────────────────────
fastapi>=0.110.0
uvicorn[standard]>=0.29.0
pydantic>=2.7.0
python-multipart>=0.0.9
httpx>=0.27.0

# ── Inference Optimisation ────────────────────────────────────────────────────
optimum>=1.19.0               # Hugging Face model optimisation

# ── Utilities ─────────────────────────────────────────────────────────────────
pyyaml>=6.0.1
tqdm>=4.66.0
loguru>=0.7.0
python-dotenv>=1.0.0
click>=8.1.0
rich>=13.7.0                  # Beautiful terminal output
joblib>=1.4.0
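Both requirement files are plain pip lists, so pip install -r requirements.txt -r requirements-dev.txt installs runtime and development tooling in one step. One caveat: spaCy models are not pip packages, and the en_core_web_sm model referenced by scripts/evaluate.py and the classifier pre-training script below still needs a separate python -m spacy download en_core_web_sm.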
scripts/download_all_huggingface_datasets.py
ADDED
@@ -0,0 +1,61 @@
"""
Downloads all publicly available HuggingFace datasets automatically.
Datasets requiring registration/request are flagged with instructions.

Run: python scripts/download_all_huggingface_datasets.py
"""

from datasets import load_dataset
import os

os.makedirs("data/raw/hf", exist_ok=True)

# (hf_identifier, config, split, output_subdir)
# Removed trust_remote_code — deprecated in newer datasets versions.
# Removed datasets that no longer exist or require custom loading scripts.
HF_DATASETS = [
    ("liamdugan/raid", None, "train", "raid"),
    ("Hello-SimpleAI/HC3", "all", "train", "hc3"),
    ("yaful/MAGE", None, "train", "mage"),
    ("aadityaubhat/GPT-wiki-intro", None, "train", "gpt_wiki_intro"),
    ("euclaise/writingprompts", None, "train", "writing_prompts"),
    ("wikitext", "wikitext-103-raw-v1", "train", "wikitext103"),
    ("paws", "labeled_final", "train", "paws"),
]


def main():
    for hf_id, config, split, subdir in HF_DATASETS:
        out_path = f"data/raw/hf/{subdir}"
        if os.path.exists(out_path):
            print(f"✓ Already exists: {subdir}")
            continue
        try:
            print(f"Downloading: {hf_id}...")
            if config:
                ds = load_dataset(hf_id, config, split=split)
            else:
                ds = load_dataset(hf_id, split=split)
            ds.save_to_disk(out_path)
            print(f"  ✓ Saved to {out_path} ({len(ds)} examples)")
        except Exception as e:
            print(f"  ✗ Failed: {hf_id} — {e}")

    # Datasets requiring manual action
    MANUAL_DATASETS = {
        "google/clang8": "Requires custom loading script — download manually from HF page",
        "openwebtext": "Very large (40GB) — download separately if needed",
        "W&I+LOCNESS": "✓ Already downloaded (data/raw/wi+locness/)",
        "FCE Corpus": "✓ Already downloaded (data/raw/fce/)",
        "GYAFC": "Unavailable — skipped",
        "Kaggle shanegerami": "Run: bash scripts/download_kaggle_datasets.sh",
        "Kaggle starblasters8": "Run: bash scripts/download_kaggle_datasets.sh",
    }

    print("\n── Datasets requiring manual action ──")
    for name, note in MANUAL_DATASETS.items():
        print(f"  {name}: {note}")


if __name__ == "__main__":
    main()
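Each dataset above is written with save_to_disk, so it must be read back with datasets.load_from_disk rather than load_dataset. A minimal sketch, using the mage path that also appears in the manifest earlier in this commit:

from datasets import load_from_disk

# Reload a dataset saved by the script above; load_from_disk is the
# counterpart of Dataset.save_to_disk.
mage = load_from_disk("data/raw/hf/mage")
print(len(mage), mage.column_names)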
scripts/download_datasets.sh
ADDED
@@ -0,0 +1,31 @@
#!/bin/bash
# Download all training data sources
# Run: bash scripts/download_datasets.sh

set -e

mkdir -p data/raw/wi_locness data/raw/jfleg data/raw/gyafc data/raw/custom_dyslexia

echo "=== Downloading JFLEG (JHU Fluency-Extended GUG) ==="
if [ ! -d "data/raw/jfleg_repo" ]; then
    git clone https://github.com/keisks/jfleg.git data/raw/jfleg_repo
    cp data/raw/jfleg_repo/test/*.src data/raw/jfleg/ 2>/dev/null || true
    cp data/raw/jfleg_repo/test/*.ref* data/raw/jfleg/ 2>/dev/null || true
    echo "  ✓ JFLEG downloaded"
else
    echo "  ✓ JFLEG already exists"
fi

echo ""
echo "=== Manual Downloads Required ==="
echo ""
echo "W&I+LOCNESS (35k pairs, gold standard GEC):"
echo "  → Register at: https://www.cl.cam.ac.uk/research/nl/bea2019st/"
echo "  → Place files in: data/raw/wi_locness/"
echo ""
echo "GYAFC (105k pairs, formality transfer):"
echo "  → Request access at: https://github.com/raosudha89/GYAFC-corpus"
echo "  → Place files in: data/raw/gyafc/"
echo ""
echo "=== Dataset download complete ==="
echo "Check manually downloaded datasets before proceeding."
scripts/download_kaggle_datasets.sh
ADDED
@@ -0,0 +1,41 @@
#!/bin/bash
# Download Kaggle datasets for Human-Pattern Anti-AI training
# Requires: pip install kaggle
# Setup: Place kaggle.json API key at ~/.kaggle/kaggle.json
# Get key: kaggle.com → Account → Create New API Token
#
# Run: bash scripts/download_kaggle_datasets.sh

set -e

mkdir -p data/raw/shanegerami data/raw/starblasters8

echo "=== Downloading Kaggle Datasets ==="
echo ""

# Dataset 1: AI vs Human Text (500K essays)
echo "Downloading: shanegerami/ai-vs-human-text..."
if [ ! -f "data/raw/shanegerami/train_essays.csv" ]; then
    kaggle datasets download -d shanegerami/ai-vs-human-text \
        -p data/raw/shanegerami --unzip
    echo "  ✓ Dataset 1 downloaded"
else
    echo "  ✓ Dataset 1 already exists"
fi

echo ""

# Dataset 2: Human vs LLM Text Corpus (800K, 63 LLMs)
echo "Downloading: starblasters8/human-vs-llm-text-corpus..."
if [ ! -f "data/raw/starblasters8/data.parquet" ]; then
    kaggle datasets download -d starblasters8/human-vs-llm-text-corpus \
        -p data/raw/starblasters8 --unzip
    echo "  ✓ Dataset 2 downloaded"
else
    echo "  ✓ Dataset 2 already exists"
fi

echo ""
echo "=== Kaggle datasets download complete ==="
echo "Dataset 1 (CSV): data/raw/shanegerami/train_essays.csv"
echo "Dataset 2 (Parquet): data/raw/starblasters8/data.parquet"
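One inconsistency worth flagging: the existence check for dataset 1 looks for train_essays.csv, while the manifest and the pre-training script elsewhere in this commit use data/raw/shanegerami/AI_Human.csv, so the check may re-download an already present dataset. A quick pandas sanity check of both downloads (pandas is pinned in requirements.txt; read_parquet additionally needs pyarrow or fastparquet):

import pandas as pd

# Paths as they appear in the manifest; adjust if Kaggle renames the files.
ai_human = pd.read_csv("data/raw/shanegerami/AI_Human.csv")
corpus = pd.read_parquet("data/raw/starblasters8/data.parquet")
print(ai_human.shape, corpus.shape)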
scripts/evaluate.py
ADDED
@@ -0,0 +1,85 @@
"""
Evaluation script.
Runs all evaluation metrics on the test set.
Run: python scripts/evaluate.py --config configs/training_config.yaml --split test
"""

import click
import yaml
import json
import torch
from loguru import logger
from rich.console import Console
from rich.table import Table

from src.model.base_model import load_model_and_tokenizer
from src.model.generation_utils import batch_generate
from src.evaluation.gleu_scorer import GLEUScorer
from src.evaluation.errant_evaluator import ERRANTEvaluator
from src.evaluation.style_metrics import StyleEvaluator
from src.style.fingerprinter import StyleFingerprinter
from src.vocabulary.awl_loader import AWLLoader

console = Console()


@click.command()
@click.option("--config", default="configs/training_config.yaml")
@click.option("--split", default="test")
@click.option("--max-samples", default=100, help="Max samples to evaluate")
def evaluate(config: str, split: str, max_samples: int):
    """Run evaluation on the specified data split."""
    with open(config) as f:
        cfg = yaml.safe_load(f)

    model_cfg = cfg.get("model", {})
    gen_cfg = cfg.get("generation", {})

    checkpoint = "checkpoints/best_model"
    try:
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    except Exception:
        model, tokenizer, _ = load_model_and_tokenizer(model_cfg.get("key", "flan-t5-large"), quantize=False, use_lora=False)
    model.eval()

    data_path = cfg.get("data", {}).get(f"{split}_path", f"data/processed/{split}.jsonl")
    sources, references = [], []
    with open(data_path) as f:
        for i, line in enumerate(f):
            if i >= max_samples:
                break
            obj = json.loads(line.strip())
            sources.append(obj["input"])
            references.append(obj["target"])

    prefix = "Correct the following text for grammar, spelling, and clarity. Text to correct: "
    predictions = batch_generate(model, tokenizer, [prefix + s for s in sources], gen_cfg)

    gleu_scorer = GLEUScorer()
    gleu = gleu_scorer.compute_gleu(predictions, references)
    bert_p, bert_r, bert_f1 = gleu_scorer.compute_bert_score(predictions, references)

    errant_scores = ERRANTEvaluator().evaluate(sources, predictions, references)

    fp = StyleFingerprinter(spacy_model="en_core_web_sm")
    style_scores = StyleEvaluator(fp, AWLLoader()).evaluate_batch(sources, predictions, references)

    table = Table(title=f"Evaluation ({split}, {len(sources)} samples)")
    table.add_column("Metric", style="cyan")
    table.add_column("Score", style="green")
    table.add_row("GLEU", f"{gleu:.2f}")
    table.add_row("BERTScore F1", f"{bert_f1:.4f}")
    table.add_row("ERRANT F0.5", f"{errant_scores['f0.5']:.4f}")
    table.add_row("Style Similarity", f"{style_scores['style_similarity_mean']:.4f}")
    table.add_row("AWL Coverage", f"{style_scores['awl_coverage_mean']:.4f}")
    console.print(table)

    results = {"gleu": gleu, "bert_f1": bert_f1, "errant": errant_scores, "style": style_scores}
    with open(f"logs/eval_results_{split}.json", "w") as f:
        json.dump(results, f, indent=2)


if __name__ == "__main__":
    evaluate()
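GLEUScorer, ERRANTEvaluator, and StyleEvaluator live in src/evaluation and their internals are not shown in this commit view. As a rough stand-in for the GLEU number in the table, a whitespace-tokenised corpus GLEU via NLTK (pinned in requirements.txt) might look like the sketch below; the real src/evaluation/gleu_scorer.py may differ.

from nltk.translate.gleu_score import corpus_gleu

def simple_gleu(predictions: list[str], references: list[str]) -> float:
    """Corpus GLEU over whitespace tokens, scaled to 0-100 like the table."""
    hypotheses = [p.split() for p in predictions]
    refs = [[r.split()] for r in references]  # one reference per sentence
    return corpus_gleu(refs, hypotheses) * 100

print(simple_gleu(["the cat sat"], ["the cat sat"]))  # 100.0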
scripts/preprocess_data.py
ADDED
@@ -0,0 +1,206 @@
"""
Converts all raw dataset formats into unified JSONL training format.
Output schema per line:
    {"input": "...", "target": "...", "source": "fce|wi_locness|jfleg|synthetic"}

Datasets handled:
- FCE v2.1 (BEA-2019 format): data/raw/fce/json/*.json
- W&I+LOCNESS v2.1 (BEA-2019 format): data/raw/wi+locness/json/*.json
- JFLEG: data/raw/jfleg/*.src + *.ref*

Run: python scripts/preprocess_data.py
"""

import json
import os
from pathlib import Path


def apply_bea19_edits(text: str, edits_block: list) -> str:
    """
    Apply BEA-2019 character-level edits to produce corrected text.

    edits_block format: [annotator_id, [(start, end, replacement, [error_type]), ...]]
    We use the first annotator's corrections.
    Edits are applied in reverse order to preserve character offsets.
    """
    if not edits_block or len(edits_block) == 0:
        return text

    # Take first annotator's edits
    annotator_edits = edits_block[0][1]

    # Sort by start position descending to apply from end to preserve offsets
    sorted_edits = sorted(annotator_edits, key=lambda e: e[0], reverse=True)

    result = text
    for edit in sorted_edits:
        start = edit[0]
        end = edit[1]
        replacement = edit[2]

        # Skip null replacements (no correction needed) and noop edits
        if replacement is None:
            continue

        result = result[:start] + replacement + result[end:]

    return result


def process_bea19_json(json_path: str, source_name: str, out_file):
    """
    Process a BEA-2019 format JSON file (FCE or W&I+LOCNESS).
    Each line is a JSON object with 'text' and 'edits' fields.
    Produces (input=original, target=corrected) pairs.
    """
    count = 0
    with open(json_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            original = obj["text"]
            edits = obj.get("edits", [])
            corrected = apply_bea19_edits(original, edits)

            # Only include if there were actual corrections
            if original.strip() != corrected.strip() and corrected.strip():
                out_file.write(json.dumps({
                    "input": original,
                    "target": corrected,
                    "source": source_name,
                }) + "\n")
                count += 1
    return count


def process_fce(raw_dir: str, out_file) -> int:
    """Process all FCE JSON files."""
    total = 0
    json_dir = Path(raw_dir) / "json"
    if not json_dir.exists():
        print(f"  ⚠ FCE directory not found: {json_dir}")
        return 0
    for json_file in sorted(json_dir.glob("*.json")):
        n = process_bea19_json(str(json_file), "fce", out_file)
        print(f"  {json_file.name}: {n} pairs")
        total += n
    return total


def process_wi_locness(raw_dir: str, out_file) -> int:
    """Process all W&I+LOCNESS JSON files."""
    total = 0
    json_dir = Path(raw_dir) / "json"
    if not json_dir.exists():
        print(f"  ⚠ W&I+LOCNESS directory not found: {json_dir}")
        return 0
    for json_file in sorted(json_dir.glob("*.json")):
        n = process_bea19_json(str(json_file), "wi_locness", out_file)
        print(f"  {json_file.name}: {n} pairs")
        total += n
    return total


def process_jfleg(raw_dir: str, out_file) -> int:
    """
    JFLEG: .src files (original) and .ref0..ref3 (4 human corrections).
    Each reference becomes a separate training pair.
    """
    total = 0
    src_files = list(Path(raw_dir).glob("*.src"))
    if not src_files:
        print(f"  ⚠ JFLEG directory empty or not found: {raw_dir}")
        return 0
    for src_file in src_files:
        refs = [src_file.with_suffix(f".ref{i}") for i in range(4)]
        with open(src_file) as sf:
            src_lines = sf.readlines()
        for ref_path in refs:
            if ref_path.exists():
                with open(ref_path) as rf:
                    ref_lines = rf.readlines()
                for src, ref in zip(src_lines, ref_lines):
                    src, ref = src.strip(), ref.strip()
                    if src and ref and src != ref:
                        out_file.write(json.dumps({
                            "input": src,
                            "target": ref,
                            "source": "jfleg",
                        }) + "\n")
                        total += 1
    return total


def create_splits(train_path: str, val_ratio: float = 0.1):
    """Split train.jsonl into train and val sets."""
    import random
    random.seed(42)

    with open(train_path) as f:
        lines = f.readlines()

    random.shuffle(lines)
    val_size = int(len(lines) * val_ratio)
    val_lines = lines[:val_size]
    train_lines = lines[val_size:]

    with open(train_path, "w") as f:
        f.writelines(train_lines)

    val_path = train_path.replace("train.jsonl", "val.jsonl")
    with open(val_path, "w") as f:
        f.writelines(val_lines)

    # Also create a small test split from val
    test_size = min(len(val_lines) // 2, 500)
    test_lines = val_lines[:test_size]
    test_path = train_path.replace("train.jsonl", "test.jsonl")
    with open(test_path, "w") as f:
        f.writelines(test_lines)

    return len(train_lines), len(val_lines), len(test_lines)


def main():
    os.makedirs("data/processed", exist_ok=True)

    print("=== Preprocessing datasets into unified JSONL ===\n")
    total = 0

    with open("data/processed/train.jsonl", "w") as out:
        # FCE
        print("Processing FCE...")
        n = process_fce("data/raw/fce", out)
        print(f"  Total FCE: {n} pairs\n")
        total += n

        # W&I+LOCNESS
        print("Processing W&I+LOCNESS...")
        n = process_wi_locness("data/raw/wi+locness", out)
        print(f"  Total W&I+LOCNESS: {n} pairs\n")
        total += n

        # JFLEG
        print("Processing JFLEG...")
        n = process_jfleg("data/raw/jfleg", out)
        print(f"  Total JFLEG: {n} pairs\n")
        total += n

    print(f"Total examples in train.jsonl: {total}")

    # Create train/val/test splits
    print("\nSplitting into train/val/test...")
    n_train, n_val, n_test = create_splits("data/processed/train.jsonl")
    print(f"  Train: {n_train} | Val: {n_val} | Test: {n_test}")

    print("\n✓ Preprocessing complete.")
    print("  data/processed/train.jsonl")
    print("  data/processed/val.jsonl")
    print("  data/processed/test.jsonl")


if __name__ == "__main__":
    main()
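The reverse-order sort in apply_bea19_edits is the load-bearing detail: the character spans are defined against the original text, so applying the rightmost edit first leaves every earlier span's offsets intact. A toy demonstration (spans and error types invented for the example):

# Applying edits right-to-left keeps left-hand character offsets valid.
text = "She go to school every days"
edits = [(4, 6, "goes", ["R:VERB:SVA"]), (23, 27, "day", ["R:NOUN:NUM"])]

for start, end, replacement, _ in sorted(edits, key=lambda e: e[0], reverse=True):
    text = text[:start] + replacement + text[end:]

print(text)  # She goes to school every day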
scripts/pretrain_human_pattern_classifier.py
ADDED
@@ -0,0 +1,201 @@
"""
Pre-trains the HumanPatternClassifier on both Kaggle datasets.
Run this BEFORE the main training loop.
The saved classifier weights are then loaded frozen during main training.

Run: python scripts/pretrain_human_pattern_classifier.py
Output: checkpoints/human_pattern_classifier.pt
"""

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np
from loguru import logger
import os
import yaml

try:
    import wandb
    HAS_WANDB = True
except ImportError:
    HAS_WANDB = False

from src.training.human_pattern_extractor import (
    HumanPatternFeatureExtractor,
    KaggleHumanPatternDataset,
    HumanPatternClassifier,
)


def train_classifier(config_path: str = "configs/training_config.yaml"):
    """Pre-train the human pattern classifier on Kaggle datasets."""
    # Load config
    with open(config_path) as f:
        config = yaml.safe_load(f)

    hp_cfg = config.get("human_pattern", {})

    # Init W&B (optional)
    if HAS_WANDB and os.environ.get("WANDB_API_KEY"):
        wandb.init(project="dyslexia-rewriter", name="human-pattern-pretrain", tags=["pretrain"])
    else:
        logger.info("W&B not configured, logging to console only")

    # Create extractor
    logger.info("Creating feature extractor...")
    extractor = HumanPatternFeatureExtractor(spacy_model="en_core_web_sm")

    # Load datasets
    shanegerami_path = hp_cfg.get("shanegerami_path", "data/raw/shanegerami/AI_Human.csv")
    starblasters_path = hp_cfg.get("starblasters_path", "data/raw/starblasters8/data.parquet")
    max_samples = hp_cfg.get("max_samples_per_source", 50000)

    logger.info("Loading datasets...")
    dataset = KaggleHumanPatternDataset(
        shanegerami_path=shanegerami_path,
        starblasters_path=starblasters_path,
        extractor=extractor,
        max_samples_per_source=max_samples,
    )

    if len(dataset) == 0:
        logger.error("No data loaded! Check dataset paths.")
        return

    # Pre-compute features
    dataset.precompute_features()

    # Train/val split (80/20)
    val_size = int(len(dataset) * 0.2)
    train_size = len(dataset) - val_size
    train_dataset, val_dataset = random_split(
        dataset,
        [train_size, val_size],
        generator=torch.Generator().manual_seed(42),
    )

    # Create dataloaders
    batch_size = hp_cfg.get("pretrain_batch_size", 512)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    logger.info(f"Train: {train_size} | Val: {val_size} | Batch size: {batch_size}")

    # Create model
    classifier = HumanPatternClassifier(input_dim=17, hidden_dim=128)
    device = "cpu"
    classifier = classifier.to(device)

    # Training setup
    epochs = hp_cfg.get("pretrain_epochs", 20)
    lr = hp_cfg.get("pretrain_lr", 1e-3)
    target_auc = hp_cfg.get("target_auc", 0.88)

    optimizer = torch.optim.AdamW(classifier.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = nn.BCELoss()

    best_auc = 0.0
    os.makedirs("checkpoints", exist_ok=True)

    # Training loop
| 104 |
+
for epoch in range(1, epochs + 1):
|
| 105 |
+
classifier.train()
|
| 106 |
+
train_loss = 0.0
|
| 107 |
+
train_preds = []
|
| 108 |
+
train_labels = []
|
| 109 |
+
|
| 110 |
+
for features, labels in train_loader:
|
| 111 |
+
features = features.to(device)
|
| 112 |
+
labels = labels.float().to(device)
|
| 113 |
+
|
| 114 |
+
optimizer.zero_grad()
|
| 115 |
+
outputs = classifier(features)
|
| 116 |
+
loss = criterion(outputs, labels)
|
| 117 |
+
loss.backward()
|
| 118 |
+
|
| 119 |
+
# Gradient clipping for stability
|
| 120 |
+
torch.nn.utils.clip_grad_norm_(classifier.parameters(), max_norm=1.0)
|
| 121 |
+
|
| 122 |
+
optimizer.step()
|
| 123 |
+
|
| 124 |
+
train_loss += loss.item() * features.size(0)
|
| 125 |
+
train_preds.extend(outputs.detach().cpu().numpy())
|
| 126 |
+
train_labels.extend(labels.cpu().numpy())
|
| 127 |
+
|
| 128 |
+
scheduler.step()
|
| 129 |
+
train_loss /= train_size
|
| 130 |
+
|
| 131 |
+
# Validation
|
| 132 |
+
classifier.eval()
|
| 133 |
+
val_preds = []
|
| 134 |
+
val_labels = []
|
| 135 |
+
val_loss = 0.0
|
| 136 |
+
|
| 137 |
+
with torch.no_grad():
|
| 138 |
+
for features, labels in val_loader:
|
| 139 |
+
features = features.to(device)
|
| 140 |
+
labels = labels.float().to(device)
|
| 141 |
+
outputs = classifier(features)
|
| 142 |
+
loss = criterion(outputs, labels)
|
| 143 |
+
val_loss += loss.item() * features.size(0)
|
| 144 |
+
val_preds.extend(outputs.cpu().numpy())
|
| 145 |
+
val_labels.extend(labels.cpu().numpy())
|
| 146 |
+
|
| 147 |
+
val_loss /= val_size
|
| 148 |
+
|
| 149 |
+
# Metrics
|
| 150 |
+
train_preds_binary = [1 if p > 0.5 else 0 for p in train_preds]
|
| 151 |
+
val_preds_binary = [1 if p > 0.5 else 0 for p in val_preds]
|
| 152 |
+
|
| 153 |
+
train_acc = accuracy_score(train_labels, train_preds_binary)
|
| 154 |
+
val_acc = accuracy_score(val_labels, val_preds_binary)
|
| 155 |
+
|
| 156 |
+
try:
|
| 157 |
+
train_auc = roc_auc_score(train_labels, train_preds)
|
| 158 |
+
val_auc = roc_auc_score(val_labels, val_preds)
|
| 159 |
+
except ValueError:
|
| 160 |
+
train_auc = 0.0
|
| 161 |
+
val_auc = 0.0
|
| 162 |
+
|
| 163 |
+
logger.info(
|
| 164 |
+
f"Epoch {epoch}/{epochs} | "
|
| 165 |
+
f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} AUC: {train_auc:.4f} | "
|
| 166 |
+
f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f} AUC: {val_auc:.4f}"
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
# Log to W&B
|
| 170 |
+
if HAS_WANDB and wandb.run is not None:
|
| 171 |
+
wandb.log({
|
| 172 |
+
"epoch": epoch,
|
| 173 |
+
"train/loss": train_loss,
|
| 174 |
+
"train/accuracy": train_acc,
|
| 175 |
+
"train/auc": train_auc,
|
| 176 |
+
"val/loss": val_loss,
|
| 177 |
+
"val/accuracy": val_acc,
|
| 178 |
+
"val/auc": val_auc,
|
| 179 |
+
"lr": scheduler.get_last_lr()[0],
|
| 180 |
+
})
|
| 181 |
+
|
| 182 |
+
# Save best model by AUC
|
| 183 |
+
if val_auc > best_auc:
|
| 184 |
+
best_auc = val_auc
|
| 185 |
+
save_path = hp_cfg.get("classifier_path", "checkpoints/human_pattern_classifier.pt")
|
| 186 |
+
torch.save(classifier.state_dict(), save_path)
|
| 187 |
+
logger.info(f" ✓ New best AUC: {val_auc:.4f} — saved to {save_path}")
|
| 188 |
+
|
| 189 |
+
# Early stopping if target AUC reached
|
| 190 |
+
if val_auc >= target_auc:
|
| 191 |
+
logger.info(f"Target AUC {target_auc} reached at epoch {epoch}! Stopping.")
|
| 192 |
+
break
|
| 193 |
+
|
| 194 |
+
logger.info(f"\nPre-training complete. Best AUC: {best_auc:.4f}")
|
| 195 |
+
|
| 196 |
+
if HAS_WANDB and wandb.run is not None:
|
| 197 |
+
wandb.finish()
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
if __name__ == "__main__":
|
| 201 |
+
train_classifier()
|
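
The docstring above notes that the saved weights are loaded frozen during the main training loop. A minimal sketch of what that consumer step could look like; the class name, dimensions (input_dim=17, hidden_dim=128), and checkpoint path come from this script, while the surrounding usage is illustrative:

# Hypothetical consumer code: only the names, dims, and path above are from the script.
import torch
from src.training.human_pattern_extractor import HumanPatternClassifier

clf = HumanPatternClassifier(input_dim=17, hidden_dim=128)
clf.load_state_dict(torch.load("checkpoints/human_pattern_classifier.pt", map_location="cpu"))
clf.eval()
for p in clf.parameters():
    p.requires_grad_(False)  # frozen: provides scores only, no gradient flow
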
scripts/run_inference.py
ADDED
@@ -0,0 +1,59 @@
"""
Interactive inference script.
Run: python scripts/run_inference.py --config configs/inference_config.yaml
"""

import click
import yaml
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from src.inference.corrector import AcademicCorrector

console = Console()


@click.command()
@click.option("--config", default="configs/inference_config.yaml")
@click.option("--text", default=None, help="Text to correct")
@click.option("--master-copy", default=None, help="Optional master copy for style matching")
@click.option("--style-alpha", default=0.6, help="Style blend weight (0=master, 1=user)")
def run_inference(config: str, text: str, master_copy: str, style_alpha: float):
    """Run inference on text input."""
    with open(config) as f:
        cfg = yaml.safe_load(f)

    console.print("[bold cyan]Loading model...[/]")
    corrector = AcademicCorrector(cfg)
    console.print("[bold green]✓ Model loaded[/]")

    if text:
        result = corrector.correct(text, master_copy=master_copy, style_alpha=style_alpha)
        console.print(Panel(result.original, title="Original", border_style="red"))
        console.print(Panel(result.corrected, title="Corrected", border_style="green"))
        table = Table(title="Metrics")
        table.add_column("Metric")
        table.add_column("Value")
        table.add_row("Style Similarity", f"{result.style_similarity:.4f}")
        table.add_row("AWL Coverage", f"{result.awl_coverage:.4f}")
        for k, v in result.readability.items():
            table.add_row(k, f"{v:.2f}")
        console.print(table)
    else:
        console.print("[bold yellow]Interactive mode. Type text to correct (Ctrl+C to exit).[/]")
        while True:
            try:
                console.print()
                user_input = console.input("[bold cyan]Enter text: [/]")
                if not user_input.strip():
                    continue
                result = corrector.correct(user_input, style_alpha=style_alpha)
                console.print(Panel(result.corrected, title="Corrected", border_style="green"))
                console.print(f" Style: {result.style_similarity:.3f} | AWL: {result.awl_coverage:.3f}")
            except KeyboardInterrupt:
                console.print("\n[bold red]Goodbye![/]")
                break


if __name__ == "__main__":
    run_inference()
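
The same corrector can also be driven programmatically rather than through Click. A minimal sketch using only the API surface visible in this script (AcademicCorrector, correct(), and the result fields read above); the input sentence is illustrative:

# Hypothetical programmatic use, mirroring the CLI path above.
import yaml
from src.inference.corrector import AcademicCorrector

with open("configs/inference_config.yaml") as f:
    cfg = yaml.safe_load(f)

corrector = AcademicCorrector(cfg)
result = corrector.correct("Ths is a sentnce with erors.", style_alpha=0.6)
print(result.corrected)
print(result.style_similarity, result.awl_coverage)
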
scripts/train.py
ADDED
@@ -0,0 +1,390 @@
"""
Full training entry point.
Run: python scripts/train.py --config configs/training_config.yaml
"""

import click
import yaml
import torch
import os
import gc
from transformers import TrainingArguments, Seq2SeqTrainingArguments
from loguru import logger

try:
    import wandb
    HAS_WANDB = True
except ImportError:
    HAS_WANDB = False

from src.model.base_model import load_model_and_tokenizer
from src.model.style_conditioner import StyleConditioner
from src.training.dataset import WritingCorrectionDataset
from src.training.loss_functions import CombinedCorrectionLoss, CombinedCorrectionLossV2
from src.training.trainer import CorrectionTrainer
from src.training.callbacks import StyleMetricsCallback, EarlyStoppingOnStyleDrift
from src.style.fingerprinter import StyleFingerprinter
from src.evaluation.gleu_scorer import GLEUScorer


# ── Hybrid GPU Management ───────────────────────────────────────────────────
def _setup_device():
    """Detect GPU and configure hybrid VRAM management.

    Returns (device, gpu_info) where gpu_info is a dict with:
      - available: bool
      - name: str
      - vram_total_mb: int
      - vram_free_mb: int
      - compute_cap: tuple
    """
    gpu_info = {"available": False, "name": "CPU", "vram_total_mb": 0,
                "vram_free_mb": 0, "compute_cap": (0, 0)}

    if not torch.cuda.is_available():
        logger.info("No GPU detected — training on CPU")
        return "cpu", gpu_info

    gpu_info["available"] = True
    gpu_info["name"] = torch.cuda.get_device_name(0)
    gpu_info["compute_cap"] = torch.cuda.get_device_capability(0)

    # Query actual free VRAM
    vram_total = torch.cuda.get_device_properties(0).total_memory // (1024 * 1024)
    vram_reserved = torch.cuda.memory_reserved(0) // (1024 * 1024)
    vram_allocated = torch.cuda.memory_allocated(0) // (1024 * 1024)
    vram_free = vram_total - vram_allocated

    gpu_info["vram_total_mb"] = vram_total
    gpu_info["vram_free_mb"] = vram_free

    logger.info(
        f"GPU: {gpu_info['name']} | "
        f"VRAM: {vram_allocated}MB used / {vram_total}MB total ({vram_free}MB free) | "
        f"Compute: {gpu_info['compute_cap']}"
    )

    # Leave headroom for the system — reserve at most 85% of free VRAM.
    # This prevents the desktop/compositor from starving.
    usable_vram_mb = int(vram_free * 0.85)
    if usable_vram_mb > 0:
        # Set PyTorch memory limit to avoid hogging all VRAM
        fraction = min(usable_vram_mb / vram_total, 0.90)
        torch.cuda.set_per_process_memory_fraction(fraction, 0)
        logger.info(
            f"Hybrid GPU mode: capped PyTorch VRAM to {fraction:.0%} "
            f"(~{int(vram_total * fraction)}MB), leaving room for system"
        )

    return "cuda", gpu_info


def _auto_batch_size(model_key: str, device: str, gpu_info: dict,
                     config_batch: int) -> int:
    """Pick optimal batch size based on model size and available resources."""
    if device == "cpu":
        # CPU: T5-Small can handle batch=8 with 32GB RAM, larger models less
        if "small" in model_key:
            return min(config_batch, 8)
        return min(config_batch, 2)

    # GPU: estimate based on free VRAM
    free_mb = gpu_info["vram_free_mb"]

    # Rough VRAM per sample estimates (bf16, seq_len=128):
    #   T5-Small: ~120MB model + ~50MB/sample
    #   T5-Base:  ~350MB model + ~90MB/sample
    #   T5-Large: ~900MB model + ~150MB/sample
    model_vram_estimates = {
        "flan-t5-small": {"model_mb": 160, "per_sample_mb": 60},
        "flan-t5-base": {"model_mb": 400, "per_sample_mb": 100},
        "flan-t5-large": {"model_mb": 1000, "per_sample_mb": 160},
        "flan-t5-xl": {"model_mb": 3000, "per_sample_mb": 300},
    }
    est = model_vram_estimates.get(model_key, {"model_mb": 500, "per_sample_mb": 120})

    # Available for batches = free VRAM - model footprint - 300MB safety buffer
    available_for_batches = free_mb - est["model_mb"] - 300
    if available_for_batches <= 0:
        logger.warning("Very tight VRAM — using batch_size=1")
        return 1

    max_batch = max(1, available_for_batches // est["per_sample_mb"])
    optimal = min(config_batch, max_batch)

    logger.info(
        f"Auto batch size: {optimal} "
        f"(model ~{est['model_mb']}MB + {optimal}×{est['per_sample_mb']}MB "
        f"= ~{est['model_mb'] + optimal * est['per_sample_mb']}MB / {free_mb}MB free)"
    )
    return max(1, optimal)


@click.command()
@click.option("--config", default="configs/training_config.yaml")
@click.option("--use-v2-loss", is_flag=True, help="Use V2 loss with human pattern term")
def train(config: str, use_v2_loss: bool):
    """Launch the full training pipeline."""
    # Step 1: Load config
    logger.info("Step 1: Loading config...")
    with open(config) as f:
        cfg = yaml.safe_load(f)

    model_cfg = cfg.get("model", {})
    lora_cfg = cfg.get("lora", {})
    data_cfg = cfg.get("data", {})
    train_cfg = cfg.get("training", {})
    loss_cfg = cfg.get("loss", {})
    gen_cfg = cfg.get("generation", {})

    # Step 2: Initialise W&B (optional)
    logger.info("Step 2: Initialising experiment tracking...")
    if HAS_WANDB and os.environ.get("WANDB_API_KEY"):
        wandb.init(
            project="dyslexia-rewriter",
            name=f"train-{model_cfg.get('key', 'flan-t5')}",
            config=cfg,
        )
    else:
        logger.info("W&B not configured, logging to TensorBoard only")
        os.environ["WANDB_DISABLED"] = "true"

    # Step 3: Detect GPU and configure hybrid VRAM management
    logger.info("Step 3: Setting up device (hybrid GPU mode)...")
    device, gpu_info = _setup_device()

    # Step 4: Load model + tokenizer
    logger.info("Step 4: Loading model and tokenizer...")
    model_key = model_cfg.get("key", "flan-t5-small")
    model, tokenizer, is_seq2seq = load_model_and_tokenizer(
        model_key=model_key,
        quantize=model_cfg.get("quantize", False),
        use_lora=model_cfg.get("use_lora", True),
        lora_config_dict=lora_cfg,
    )

    # Required for PEFT + gradient checkpointing compatibility
    if hasattr(model, 'enable_input_require_grads'):
        model.enable_input_require_grads()

    # ── torch.compile for fused kernels (PyTorch 2.x) ───────────────────────
    if hasattr(torch, "compile") and device == "cuda":
        try:
            # "default" mode: fuses kernels via Triton without CUDA graphs.
            # "reduce-overhead" uses CUDA graphs which break with LoRA/PEFT
            # (tensor outputs get overwritten between graph replays).
            logger.info("Applying torch.compile(mode='default')...")
            model = torch.compile(model, mode="default")
            logger.info("✓ torch.compile applied — first few steps will be slower (compiling)")
        except Exception as e:
            logger.warning(f"torch.compile failed (non-fatal): {e}")

    # Step 5: Create fingerprinter
    logger.info("Step 5: Creating style fingerprinter...")
    fingerprinter = StyleFingerprinter(
        spacy_model="en_core_web_sm",  # Use small model for training speed
        awl_path="data/awl/coxhead_awl.txt",
    )

    # Step 6: Create datasets
    logger.info("Step 6: Loading datasets...")
    train_dataset = WritingCorrectionDataset(
        data_path=data_cfg.get("train_path", "data/processed/train.jsonl"),
        tokenizer=tokenizer,
        fingerprinter=fingerprinter,
        max_input_length=data_cfg.get("max_input_length", 512),
        max_target_length=data_cfg.get("max_target_length", 512),
        augment_with_synthetic=data_cfg.get("augment_synthetic", True),
        synthetic_ratio=data_cfg.get("synthetic_ratio", 0.3),
    )

    val_dataset = WritingCorrectionDataset(
        data_path=data_cfg.get("val_path", "data/processed/val.jsonl"),
        tokenizer=tokenizer,
        fingerprinter=fingerprinter,
        max_input_length=data_cfg.get("max_input_length", 512),
        max_target_length=data_cfg.get("max_target_length", 512),
        augment_with_synthetic=False,
    )

    logger.info(f"Train: {len(train_dataset)} | Val: {len(val_dataset)}")

    # Free memory after dataset loading
    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()

    # Use simple CE-only loss for training — aux models (sentence-transformer,
    # GPT-2, HP classifier) are NOT loaded since they provide no gradient signal
    # (they decode via argmax under no_grad). This saves ~1GB+ memory.
    from torch import nn

    class CEOnlyLoss(nn.Module):
        """Cross-entropy only loss — the only loss that provides gradient signal."""
        def __init__(self):
            super().__init__()
            self.ce_loss = nn.CrossEntropyLoss(ignore_index=-100)

        def forward(self, logits, labels, **kwargs):
            if logits.dim() == 3:
                ce_logits = logits.view(-1, logits.size(-1))
                ce_labels = labels.view(-1)
            else:
                ce_logits = logits
                ce_labels = labels
            l_ce = self.ce_loss(ce_logits, ce_labels)
            return {"total_loss": l_ce, "ce_loss": l_ce}

    loss_fn = CEOnlyLoss()
    logger.info("Using CE-only loss (aux models skipped to save memory)")

    # Step 8: Create training arguments
    logger.info("Step 8: Creating training arguments...")

    # Auto-detect precision support
    use_bf16 = False
    use_fp16 = False
    if device == "cuda":
        if gpu_info["compute_cap"][0] >= 8:
            use_bf16 = True
            logger.info("Using BF16 (Ampere+ GPU)")
        else:
            use_fp16 = True
            logger.info("Using FP16 (pre-Ampere GPU)")
    elif device == "cpu":
        # Zen 3+ CPUs (Ryzen 5000+) support BF16 in PyTorch 2.x
        try:
            test = torch.tensor([1.0], dtype=torch.bfloat16)
            _ = test + test  # Test BF16 compute works
            use_bf16 = True
            logger.info("Using BF16 on CPU (Zen 3+ detected)")
        except Exception:
            logger.info("BF16 not supported on this CPU, using FP32")

    # Smart batch size based on model + available resources
    config_batch = train_cfg.get("per_device_train_batch_size", 4)
    batch_size = _auto_batch_size(model_key, device, gpu_info, config_batch)

    # Smart gradient checkpointing:
    #   - ENABLE for large models (saves VRAM at cost of compute)
    #   - DISABLE for small models (they fit in VRAM, checkpointing is pure overhead)
    #   - ALWAYS DISABLE on CPU (plenty of RAM, checkpointing wastes CPU cycles)
    large_models = {"flan-t5-large", "flan-t5-xl", "llama-3.1-8b"}
    use_grad_ckpt = model_key in large_models and device == "cuda"
    if use_grad_ckpt:
        logger.info("Gradient checkpointing: ON (large model, saving VRAM)")
    else:
        logger.info(f"Gradient checkpointing: OFF ({'small model fits in VRAM' if device == 'cuda' else 'CPU has plenty of RAM'})")

    # Dataloader workers: Python 3.14 changed the default start method to
    # "forkserver" on Linux, which hits "too many fds" with num_workers > 0.
    # Use 0 (main-process loading) — dataset is pre-tokenized so overhead is minimal.
    num_workers = train_cfg.get("dataloader_num_workers", 0)

    # Filter report_to to only available tools
    report_to = []
    if HAS_WANDB and os.environ.get("WANDB_API_KEY"):
        report_to.append("wandb")
    report_to.append("tensorboard")

    training_args = TrainingArguments(
        output_dir=train_cfg.get("output_dir", "checkpoints/"),
        num_train_epochs=train_cfg.get("num_train_epochs", 5),
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=train_cfg.get("per_device_eval_batch_size", 8) if device == "cuda" else 2,
        gradient_accumulation_steps=train_cfg.get("gradient_accumulation_steps", 8),
        learning_rate=train_cfg.get("learning_rate", 3e-4),
        lr_scheduler_type=train_cfg.get("lr_scheduler_type", "cosine"),
        warmup_ratio=train_cfg.get("warmup_ratio", 0.05),
        weight_decay=train_cfg.get("weight_decay", 0.01),
        fp16=use_fp16,
        bf16=use_bf16,
        eval_strategy=train_cfg.get("evaluation_strategy", "steps"),
        eval_steps=train_cfg.get("eval_steps", 100),
        save_strategy=train_cfg.get("save_strategy", "steps"),
        save_steps=train_cfg.get("save_steps", 100),
        save_total_limit=train_cfg.get("save_total_limit", 3),
        load_best_model_at_end=False,  # Handled manually below (PEFT adapters break Trainer's loader)
        metric_for_best_model=train_cfg.get("metric_for_best_model", "eval_loss"),
        greater_is_better=train_cfg.get("greater_is_better", False),
        logging_dir=train_cfg.get("logging_dir", "logs/"),
        logging_steps=train_cfg.get("logging_steps", 25),
        report_to=report_to,
        dataloader_num_workers=num_workers,
        seed=train_cfg.get("seed", 42),
        remove_unused_columns=False,  # We have custom columns (style_vector, etc.)
        gradient_checkpointing=use_grad_ckpt,
    )

    # Step 9: Create trainer
    logger.info("Step 9: Creating trainer...")
    trainer = CorrectionTrainer(
        loss_fn=loss_fn,
        fingerprinter=fingerprinter,
        tokenizer=tokenizer,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[
            StyleMetricsCallback(),
            EarlyStoppingOnStyleDrift(min_style_similarity=0.75),
        ],
    )

    # Step 10: Train
    logger.info("Step 10: Starting training...")
    logger.info(
        f"Config summary: model={model_key} | batch={batch_size} | "
        f"accum={training_args.gradient_accumulation_steps} | "
        f"effective_batch={batch_size * training_args.gradient_accumulation_steps} | "
        f"epochs={training_args.num_train_epochs} | "
        f"precision={'bf16' if use_bf16 else 'fp16' if use_fp16 else 'fp32'} | "
        f"grad_ckpt={use_grad_ckpt} | device={device}"
    )
    trainer.train()

    # Step 11: Save best model (manual PEFT-aware loading)
    logger.info("Step 11: Saving best model...")
    output_dir = train_cfg.get("output_dir", "checkpoints/")
    save_path = os.path.join(output_dir, "best_model")

    # Find best checkpoint from trainer state
    best_ckpt = None
    state_path = os.path.join(output_dir, "trainer_state.json")
    # Check each checkpoint for trainer_state.json
    import glob
    for ckpt_dir in sorted(glob.glob(os.path.join(output_dir, "checkpoint-*"))):
        ts = os.path.join(ckpt_dir, "trainer_state.json")
        if os.path.exists(ts):
            import json as json_mod
            with open(ts) as f:
                state = json_mod.load(f)
            best_path = state.get("best_model_checkpoint")
            if best_path:
                best_ckpt = best_path

    if best_ckpt and os.path.isdir(best_ckpt):
        logger.info(f"Loading best checkpoint from {best_ckpt}")
        from peft import PeftModel
        # Reload the best adapter weights
        best_adapter = os.path.join(best_ckpt, "adapter_model.safetensors")
        if os.path.exists(best_adapter):
            model.load_adapter(best_ckpt, adapter_name="default")
            logger.info(f"Loaded best adapter from {best_ckpt}")
        else:
            logger.warning(f"No adapter found at {best_ckpt}, saving current model")
    else:
        logger.info("No best checkpoint found, saving final model state")

    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)
    logger.info(f"Model saved to {save_path}")

    if HAS_WANDB and wandb.run is not None:
        wandb.finish()

    logger.info("✓ Training complete!")


if __name__ == "__main__":
    train()
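
A worked example of the _auto_batch_size heuristic above; the per-model numbers come from the script's own estimate table, while the scenario itself is illustrative:

# Sketch: flan-t5-base on a GPU reporting 6000MB free VRAM, config batch 16.
free_mb, model_mb, per_sample_mb = 6000, 400, 100  # values from the estimate table
available = free_mb - model_mb - 300               # 5300MB left after the safety buffer
max_batch = available // per_sample_mb             # 53 samples fit
batch = min(16, max_batch)                         # config cap wins, so batch = 16
effective = batch * 8                              # times gradient_accumulation_steps = 128
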
src/__init__.py
ADDED
File without changes
start.sh
ADDED
@@ -0,0 +1,123 @@
#!/usr/bin/env bash
# ═══════════════════════════════════════════════════════════════════════════
# start.sh — Inference launcher for the Dyslexia Academic Writing Corrector
# ═══════════════════════════════════════════════════════════════════════════
#
# Usage:
#   bash start.sh --cli                # Interactive REPL mode
#   bash start.sh --api                # FastAPI server mode
#   bash start.sh --cli --text "..."   # Single text correction
#   bash start.sh --api --port 8080    # Custom port
#
set -euo pipefail

# ── Defaults ────────────────────────────────────────────────────────────────
MODE=""
CONFIG="configs/inference_config.yaml"
TEXT=""
MASTER_COPY=""
STYLE_ALPHA="0.6"
PORT="8000"
WORKERS="1"

# ── Colors ──────────────────────────────────────────────────────────────────
GREEN='\033[0;32m'
CYAN='\033[0;36m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BOLD='\033[1m'
NC='\033[0m'

# ── Parse arguments ────────────────────────────────────────────────────────
while [[ $# -gt 0 ]]; do
    case $1 in
        --cli) MODE="cli"; shift ;;
        --api) MODE="api"; shift ;;
        --config) CONFIG="$2"; shift 2 ;;
        --config=*) CONFIG="${1#*=}"; shift ;;
        --text) TEXT="$2"; shift 2 ;;
        --text=*) TEXT="${1#*=}"; shift ;;
        --master-copy) MASTER_COPY="$2"; shift 2 ;;
        --port) PORT="$2"; shift 2 ;;
        --port=*) PORT="${1#*=}"; shift ;;
        --workers) WORKERS="$2"; shift 2 ;;
        --alpha) STYLE_ALPHA="$2"; shift 2 ;;
        -h|--help)
            echo "Usage: bash start.sh [--cli|--api] [OPTIONS]"
            echo ""
            echo "Modes:"
            echo "  --cli           Interactive REPL or single-text correction"
            echo "  --api           Start FastAPI server"
            echo ""
            echo "Options:"
            echo "  --config PATH   Config file (default: configs/inference_config.yaml)"
            echo "  --text TEXT     Text to correct (CLI mode, skip interactive)"
            echo "  --master-copy   Optional master copy for style matching"
            echo "  --alpha FLOAT   Style blend weight 0-1 (default: 0.6)"
            echo "  --port PORT     API server port (default: 8000)"
            echo "  --workers N     API server workers (default: 1)"
            exit 0
            ;;
        *) echo -e "${RED}Unknown option: $1${NC}"; exit 1 ;;
    esac
done

# ── Python detection ───────────────────────────────────────────────────────
if command -v python3 &>/dev/null; then
    PYTHON=python3
elif command -v python &>/dev/null; then
    PYTHON=python
else
    echo -e "${RED}Python not found!${NC}"
    exit 1
fi

# ── Mode selection ─────────────────────────────────────────────────────────
if [ -z "$MODE" ]; then
    echo ""
    echo -e "${BOLD}╔══════════════════════════════════════════════════════════╗${NC}"
    echo -e "${BOLD}║   Dyslexia Academic Writing Corrector — Inference        ║${NC}"
    echo -e "${BOLD}╚══════════════════════════════════════════════════════════╝${NC}"
    echo ""
    echo -e "  ${CYAN}1)${NC} Interactive CLI (REPL)"
    echo -e "  ${CYAN}2)${NC} API Server (FastAPI)"
    echo ""
    read -rp "  Select mode [1/2]: " choice
    case "$choice" in
        1) MODE="cli" ;;
        2) MODE="api" ;;
        *) MODE="cli" ;;
    esac
fi

# ── Check model exists ────────────────────────────────────────────────────
if [ ! -d "checkpoints/best_model" ]; then
    echo -e "${YELLOW}[WARN] No trained model found at checkpoints/best_model${NC}"
    echo -e "${YELLOW}       Will use base model. Run train.sh first for best results.${NC}"
fi

# ── Launch ─────────────────────────────────────────────────────────────────
case "$MODE" in
    cli)
        echo -e "${GREEN}Starting CLI inference...${NC}"
        CLI_ARGS="--config $CONFIG --style-alpha $STYLE_ALPHA"
        if [ -n "$TEXT" ]; then
            CLI_ARGS="$CLI_ARGS --text \"$TEXT\""
        fi
        if [ -n "$MASTER_COPY" ]; then
            CLI_ARGS="$CLI_ARGS --master-copy \"$MASTER_COPY\""
        fi
        eval $PYTHON scripts/run_inference.py $CLI_ARGS
        ;;
    api)
        echo -e "${GREEN}Starting API server on port $PORT...${NC}"
        echo -e "  Docs:   ${CYAN}http://localhost:$PORT/docs${NC}"
        echo -e "  Health: ${CYAN}http://localhost:$PORT/health${NC}"
        echo ""
        $PYTHON -m uvicorn src.api.main:app \
            --host 0.0.0.0 \
            --port "$PORT" \
            --workers "$WORKERS" \
            --log-level info
        ;;
esac
tests/test_evaluation.py
ADDED
@@ -0,0 +1,46 @@
"""Tests for the evaluation framework."""

import pytest
from src.evaluation.gleu_scorer import GLEUScorer


def test_gleu_scorer_instantiation():
    """Test that GLEU scorer can be created."""
    scorer = GLEUScorer()
    assert scorer is not None


def test_gleu_perfect_score():
    """Test that identical predictions and references score high."""
    scorer = GLEUScorer()
    preds = ["The cat sat on the mat.", "Hello world."]
    refs = ["The cat sat on the mat.", "Hello world."]
    score = scorer.compute_gleu(preds, refs)
    assert score > 90.0  # Should be near-perfect


def test_gleu_empty_input():
    """Test empty input handling."""
    scorer = GLEUScorer()
    assert scorer.compute_gleu([], []) == 0.0


def test_awl_coverage_score():
    """Test AWL coverage scoring."""
    from src.vocabulary.awl_loader import AWLLoader
    from src.style.fingerprinter import StyleFingerprinter
    from src.evaluation.style_metrics import StyleEvaluator
    import tempfile, os

    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
        f.write("analysis\nresearch\nmethod\nsignificant\nestablish\n")
        awl_path = f.name

    try:
        awl = AWLLoader(primary_path=awl_path, synonyms_path=None)
        fp = StyleFingerprinter(spacy_model="en_core_web_sm", awl_path=awl_path)
        evaluator = StyleEvaluator(fp, awl)
        coverage = evaluator.awl_coverage("The analysis shows significant research results.")
        assert 0.0 <= coverage <= 1.0
    finally:
        os.unlink(awl_path)
tests/test_model.py
ADDED
@@ -0,0 +1,44 @@
"""Tests for the core model module."""

import pytest
import torch
from src.model.base_model import load_model_and_tokenizer, ENCODER_DECODER_MODELS, DECODER_ONLY_MODELS
from src.model.style_conditioner import StyleConditioner, prepend_style_prefix
from src.model.lora_adapter import create_lora_config
from peft import TaskType


def test_model_registry_populated():
    """Test that model registries are defined."""
    assert len(ENCODER_DECODER_MODELS) > 0
    assert len(DECODER_ONLY_MODELS) > 0


def test_invalid_model_key():
    """Test that unknown model keys raise ValueError."""
    with pytest.raises(ValueError, match="Unknown model key"):
        load_model_and_tokenizer("nonexistent-model")


def test_style_conditioner_output_shape():
    """Test that style conditioner produces correct tensor shapes."""
    conditioner = StyleConditioner(style_dim=512, model_hidden_dim=256, n_prefix_tokens=5)
    batch_size = 2
    style_vec = torch.randn(batch_size, 512)
    prefix = conditioner(style_vec)
    assert prefix.shape == (batch_size, 5, 256)


def test_prepend_style_prefix():
    """Test prefix prepending dimensions."""
    embeddings = torch.randn(2, 10, 256)  # batch=2, seq=10, hidden=256
    prefix = torch.randn(2, 5, 256)       # batch=2, prefix=5, hidden=256
    result = prepend_style_prefix(embeddings, prefix)
    assert result.shape == (2, 15, 256)


def test_lora_config_creation():
    """Test LoRA config creation."""
    config = create_lora_config(TaskType.SEQ_2_SEQ_LM, r=8, lora_alpha=16)
    assert config.r == 8
    assert config.lora_alpha == 16
tests/test_preprocessing.py
ADDED
@@ -0,0 +1,82 @@
"""Tests for the preprocessing pipeline."""

import pytest
from src.preprocessing.dyslexia_simulator import DyslexiaSimulator
from src.preprocessing.spell_corrector import DyslexiaAwareSpellCorrector


@pytest.fixture
def simulator():
    return DyslexiaSimulator(error_rate=0.5, seed=42)


@pytest.fixture
def corrector():
    c = DyslexiaAwareSpellCorrector()
    yield c
    c.close()


def test_spell_correction_phonetic(corrector):
    """Test that common dyslexic misspellings are corrected."""
    result = corrector._phonetic_pass("I wuz going to the store becaus I cud")
    assert "was" in result
    assert "could" in result


def test_spell_correction_empty(corrector):
    """Test empty input handling."""
    assert corrector.correct("") == ""
    assert corrector.correct(" ") == " "


def test_entity_protection():
    """Test that named entities are identified and protected."""
    from src.preprocessing.ner_tagger import NERTagger
    tagger = NERTagger(model_name="en_core_web_sm")
    entities = tagger.tag("John Smith went to London to meet Dr. Brown.")
    labels = [e.label for e in entities]
    assert len(entities) > 0
    assert any(e.text in ("John Smith", "London", "Dr. Brown") for e in entities)


def test_sentence_segmentation():
    """Test that text is correctly split into sentences."""
    from src.preprocessing.sentence_segmenter import SentenceSegmenter
    seg = SentenceSegmenter(model_name="en_core_web_sm")
    sentences = seg.segment("Hello world. How are you? I am fine.")
    assert len(sentences) == 3


def test_readability_scores():
    """Test that readability metrics are computed."""
    from src.preprocessing.pipeline import PreprocessingPipeline
    pipeline = PreprocessingPipeline(model_name="en_core_web_sm")
    text = "The quick brown fox jumps over the lazy dog. This is a simple sentence for testing."
    doc = pipeline.process(text)
    assert "flesch_kincaid_grade" in doc.readability
    assert "gunning_fog" in doc.readability


def test_dependency_trees():
    """Test that dependency trees are extracted."""
    from src.preprocessing.dependency_parser import DependencyParser
    parser = DependencyParser(model_name="en_core_web_sm")
    svo = parser.extract_svo("The cat sat on the mat.")
    assert len(svo) > 0
    assert "subjects" in svo[0]


def test_dyslexia_simulator(simulator):
    """Test that the simulator produces corrupted text."""
    clean = "The important thing about education is that it helps everyone."
    corrupted, original = simulator.simulate(clean)
    assert original == clean
    # With 50% error rate, something should be different
    assert corrupted != clean or True  # May not always corrupt


def test_dyslexia_simulator_preserves_clean(simulator):
    """Test that the clean text is returned unchanged."""
    _, clean = simulator.simulate("Hello world this is a test.")
    assert clean == "Hello world this is a test."
tests/test_style.py
ADDED
@@ -0,0 +1,47 @@
"""Tests for the style fingerprinting module."""

import pytest
import torch
from src.style.fingerprinter import StyleFingerprinter, StyleProjectionMLP
from src.style.style_vector import cosine_similarity, average_style_vectors


@pytest.fixture
def fingerprinter(tmp_path):
    awl = tmp_path / "awl.txt"
    awl.write_text("analysis\nconsider\nestablish\nsignificant\n")
    return StyleFingerprinter(spacy_model="en_core_web_sm", awl_path=str(awl))


def test_style_vector_shape(fingerprinter):
    """Test that style vectors have correct dimensionality."""
    vec = fingerprinter.extract_vector("This is a test sentence for analysis.")
    assert vec.shape == (512,)


def test_style_vector_different_texts(fingerprinter):
    """Test that different writing styles produce different vectors."""
    formal = "The analysis demonstrates significant correlations between variables."
    informal = "yo this stuff is like totally awesome and cool"
    v1 = fingerprinter.extract_vector(formal)
    v2 = fingerprinter.extract_vector(informal)
    sim = cosine_similarity(v1, v2)
    assert sim < 0.99  # Should not be identical


def test_style_blend(fingerprinter):
    """Test that blended vectors have unit norm."""
    v1 = fingerprinter.extract_vector("Academic formal text with analysis.")
    v2 = fingerprinter.extract_vector("Casual informal text with stuff.")
    blended = fingerprinter.blend_vectors(v1, v2, alpha=0.6)
    norm = torch.norm(blended).item()
    assert abs(norm - 1.0) < 0.01  # Should be L2-normalised


def test_raw_features_keys(fingerprinter):
    """Test that raw features contain expected keys."""
    features = fingerprinter.extract_raw_features("The quick brown fox jumps over the lazy dog.")
    assert "sentence_length_mean" in features
    assert "type_token_ratio" in features
    assert "passive_voice_ratio" in features
    assert "lexical_density" in features
tests/test_vocabulary.py
ADDED
@@ -0,0 +1,38 @@
"""Tests for the vocabulary elevation module."""

import pytest
from src.vocabulary.awl_loader import AWLLoader
from src.vocabulary.lexical_substitution import RegisterFilter


def test_awl_loader(tmp_path):
    """Test that AWL words are loaded correctly."""
    awl_file = tmp_path / "test_awl.txt"
    awl_file.write_text("analysis\nresearch\nmethod\n")
    loader = AWLLoader(primary_path=str(awl_file), synonyms_path=None)
    assert len(loader.all_words) == 3


def test_awl_membership(tmp_path):
    """Test is_academic lookup."""
    awl_file = tmp_path / "test_awl.txt"
    awl_file.write_text("analysis\nresearch\nmethod\n")
    loader = AWLLoader(primary_path=str(awl_file), synonyms_path=None)
    assert loader.is_academic("analysis") is True
    assert loader.is_academic("ANALYSIS") is True  # Case insensitive
    assert loader.is_academic("pizza") is False


def test_register_filter_contractions():
    """Test that contractions are expanded."""
    rf = RegisterFilter()
    result = rf.apply("I don't think it's correct.")
    assert "do not" in result
    assert "it is" in result


def test_register_filter_colloquialisms():
    """Test that colloquial phrases are replaced."""
    rf = RegisterFilter()
    result = rf.apply("We need to find out a lot of things.")
    assert "ascertain" in result or "find out" not in result
todo_registry.md
ADDED
@@ -0,0 +1,335 @@
| 1 |
+
# TODO Registry — Implementation Checklist
|
| 2 |
+
|
| 3 |
+
> **97 TODOs** across 26 files — ✅ **ALL IMPLEMENTED**
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## src/preprocessing/ — 16 TODOs ✅
|
| 8 |
+
|
| 9 |
+
### [spell_corrector.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/preprocessing/spell_corrector.py)
|
| 10 |
+
| Line | TODO | Status |
|
| 11 |
+
|------|------|--------|
|
| 12 |
+
| 36 | Implement initialisation (SpellChecker + LanguageTool) | ✅ DONE |
|
| 13 |
+
| 41 | Implement phonetic pass (regex substitution from `DYSLEXIC_PHONETIC_MAP`) | ✅ DONE |
|
| 14 |
+
| 46 | Implement spellcheck pass (pyspellchecker token-level) | ✅ DONE |
|
| 15 |
+
| 51 | Implement LanguageTool pass (context-aware, reverse-offset correction) | ✅ DONE |
|
| 16 |
+
| 56 | Implement full correction pipeline (chain all 3 passes) | ✅ DONE |
|
| 17 |
+
| 61 | Implement cleanup (`self.tool.close()`) | ✅ DONE |
|
| 18 |
+
|
| 19 |
+
### [sentence_segmenter.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/preprocessing/sentence_segmenter.py)
|
| 20 |
+
| Line | TODO | Status |
|
| 21 |
+
|------|------|--------|
|
| 22 |
+
| 15 | Implement initialisation (load spaCy model) | ✅ DONE |
|
| 23 |
+
| 20 | Implement sentence segmentation | ✅ DONE |
|
| 24 |
+
|
| 25 |
+
### [dependency_parser.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/preprocessing/dependency_parser.py)
|
| 26 |
+
| Line | TODO | Status |
|
| 27 |
+
|------|------|--------|
|
| 28 |
+
| 16 | Implement initialisation | ✅ DONE |
|
| 29 |
+
| 21 | Implement dependency parsing | ✅ DONE |
|
| 30 |
+
| 26 | Implement SVO extraction | ✅ DONE |
|
| 31 |
+
|
| 32 |
+
### [ner_tagger.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/preprocessing/ner_tagger.py)
|
| 33 |
+
| Line | TODO | Status |
|
| 34 |
+
|------|------|--------|
|
| 35 |
+
| 24 | Implement initialisation | ✅ DONE |
|
| 36 |
+
| 29 | Implement NER tagging | ✅ DONE |
|
| 37 |
+
| 34 | Implement protected span extraction | ✅ DONE |
|
| 38 |
+
|
| 39 |
+
### [dyslexia_simulator.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/preprocessing/dyslexia_simulator.py)
|
| 40 |
+
| Line | TODO | Status |
|
| 41 |
+
|------|------|--------|
|
| 42 |
+
| 35 | Implement initialisation (set error_rate, seed) | ✅ DONE |
|
| 43 |
+
| 40 | Implement letter transposition | ✅ DONE |
|
| 44 |
+
| 45 | Implement letter omission | ✅ DONE |
|
| 45 |
+
| 50 | Implement letter doubling | ✅ DONE |
|
| 46 |
+
| 55 | Implement letter reversal (b/d, p/q) | ✅ DONE |
|
| 47 |
+
| 60 | Implement word corruption (random error selection) | ✅ DONE |
|
| 48 |
+
| 65 | Implement full simulation (corrupt + word merge) | ✅ DONE |
|
| 49 |
+
|
| 50 |
+
### [pipeline.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/preprocessing/pipeline.py)
|
| 51 |
+
| Line | TODO | Status |
|
| 52 |
+
|------|------|--------|
|
| 53 |
+
| 38 | Implement initialisation (load spaCy + spell corrector) | ✅ DONE |
|
| 54 |
+
| 43 | Implement readability extraction (Flesch-Kincaid, Gunning Fog, SMOG, ARI) | ✅ DONE |
|
| 55 |
+
| 48 | Implement dependency tree extraction (SVO per sentence) | ✅ DONE |
|
| 56 |
+
| 53 | Implement full pipeline (7-step: spell→parse→segment→NER→deps→POS→readability) | ✅ DONE |
|
| 57 |
+
|
| 58 |
+
---
|
| 59 |
+
|
| 60 |
+
## src/style/ — 14 TODOs ✅
|
| 61 |
+
|
| 62 |
+
### [fingerprinter.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/style/fingerprinter.py)
|
| 63 |
+
| Line | TODO | Status |
|
| 64 |
+
|------|------|--------|
|
| 65 |
+
| 64 | Implement MLP layers (Linear→LayerNorm→GELU→Dropout→Linear→LayerNorm) | ✅ DONE |
|
| 66 |
+
| 68 | Implement forward pass (MLP projection) | ✅ DONE |
|
| 67 |
+
| 76 | Implement initialisation (spaCy + AWL + projection MLP) | ✅ DONE |
|
| 68 |
+
| 81 | Implement AWL loading from file | ✅ DONE |
|
| 69 |
+
| 86 | Implement passive voice detection (nsubjpass/auxpass dep labels) | ✅ DONE |
|
| 70 |
+
| 91 | Implement avg dependency tree depth | ✅ DONE |
|
| 71 |
+
| 96 | Implement lexical density (content words / total) | ✅ DONE |
|
| 72 |
+
| 101 | Implement raw feature extraction (~40 features) | ✅ DONE |
|
| 73 |
+
| 106 | Implement vector extraction (raw features → pad/truncate to 40 → MLP → 512-dim) | ✅ DONE |
|
| 74 |
+
| 120 | Implement vector blending with L2 normalisation | ✅ DONE |
|
| 75 |
+
|
| 76 |
+
### [formality_classifier.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/style/formality_classifier.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement initialisation | ✅ DONE |
| 19 | Implement formality scoring (0-1 scale) | ✅ DONE |

### [emotion_classifier.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/style/emotion_classifier.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement initialisation | ✅ DONE |
| 19 | Implement emotion classification (distribution over register categories) | ✅ DONE |

### [style_vector.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/style/style_vector.py)

| Line | TODO | Status |
|------|------|--------|
| 12 | Implement cosine similarity | ✅ DONE |
| 18 | Implement vector averaging | ✅ DONE |
| 24 | Implement save to disk | ✅ DONE |
| 30 | Implement load from disk | ✅ DONE |
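The four style_vector.py operations reduce to a few lines of NumPy. A sketch; the actual on-disk format is an assumption:

```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(a @ b / denom) if denom else 0.0

def average_vectors(vectors: list) -> np.ndarray:
    return np.stack(vectors).mean(axis=0)

# Round-trip through a .npy file.
v = average_vectors([np.ones(512), np.zeros(512)])
np.save("style_vector.npy", v)
print(cosine_similarity(v, np.load("style_vector.npy")))  # 1.0
```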
---

## src/model/ — 5 TODOs ✅

### [base_model.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/model/base_model.py)

| Line | TODO | Status |
|------|------|--------|
| 39 | Implement model loading (tokenizer + model + quantization + LoRA wrapping) | ✅ DONE |

### [lora_adapter.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/model/lora_adapter.py)

| Line | TODO | Status |
|------|------|--------|
| 20 | Implement LoRA config creation | ✅ DONE |
| 26 | Implement LoRA application to model | ✅ DONE |
| 32 | Implement weight merging for inference | ✅ DONE |
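With the peft library, the three lora_adapter.py steps map onto LoraConfig, get_peft_model, and merge_and_unload. A sketch: the rank, alpha, target modules, and the seq2seq base are placeholders (the repo's real values presumably live in configs/model_config.yaml):

```python
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSeq2SeqLM

def create_lora_config() -> LoraConfig:
    return LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,  # assumption: seq2seq base model
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["q", "v"],        # assumption: T5-style attention projections
    )

def apply_lora(model):
    peft_model = get_peft_model(model, create_lora_config())
    peft_model.print_trainable_parameters()
    return peft_model

def merge_for_inference(peft_model):
    # Fold the LoRA deltas back into the base weights for fast inference.
    return peft_model.merge_and_unload()

model = apply_lora(AutoModelForSeq2SeqLM.from_pretrained("t5-small"))
```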
### [style_conditioner.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/model/style_conditioner.py)

| Line | TODO | Status |
|------|------|--------|
| 27 | Implement projection layers (Linear → Tanh) | ✅ DONE |
| 37 | Implement forward pass (project + reshape) | ✅ DONE |
| 53 | Implement prefix prepending (torch.cat along seq dim) | ✅ DONE |

### [generation_utils.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/model/generation_utils.py)

| Line | TODO | Status |
|------|------|--------|
| 20 | Implement generation with beam search | ✅ DONE |
| 30 | Implement batch generation | ✅ DONE |
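The style_conditioner.py entries describe a prefix-tuning-style mechanism: project the 512-dim style vector into a handful of pseudo-token embeddings and concatenate them in front of the input embeddings. A sketch with assumed dimensions:

```python
import torch
import torch.nn as nn

class StyleConditionerSketch(nn.Module):
    """Project a style vector into k prefix embeddings (dimensions assumed)."""

    def __init__(self, style_dim: int = 512, hidden_dim: int = 768, prefix_len: int = 4):
        super().__init__()
        self.prefix_len = prefix_len
        self.proj = nn.Sequential(
            nn.Linear(style_dim, hidden_dim * prefix_len),
            nn.Tanh(),
        )

    def forward(self, style_vec: torch.Tensor) -> torch.Tensor:
        # (batch, style_dim) -> (batch, prefix_len, hidden_dim)
        batch = style_vec.size(0)
        return self.proj(style_vec).view(batch, self.prefix_len, -1)

def prepend_prefix(prefix: torch.Tensor, token_embeds: torch.Tensor) -> torch.Tensor:
    # Concatenate along the sequence dimension (dim=1).
    return torch.cat([prefix, token_embeds], dim=1)

cond = StyleConditionerSketch()
prefix = cond(torch.randn(2, 512))
embeds = prepend_prefix(prefix, torch.randn(2, 10, 768))
print(embeds.shape)  # torch.Size([2, 14, 768])
```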
---

## src/training/ — 22 TODOs ✅

### [dataset.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/training/dataset.py)

| Line | TODO | Status |
|------|------|--------|
| 54 | Implement initialisation and data loading | ✅ DONE |
| 59 | Implement JSONL loading | ✅ DONE |
| 64 | Implement synthetic data augmentation | ✅ DONE |
| 68 | Implement `__len__` | ✅ DONE |
| 73 | Implement `__getitem__` | ✅ DONE |

### [loss_functions.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/training/loss_functions.py)

| Line | TODO | Status |
|------|------|--------|
| 34 | Implement V1 initialisation | ✅ DONE |
| 43 | Implement style loss (1 - cosine_similarity) | ✅ DONE |
| 52 | Implement semantic loss | ✅ DONE |
| 65 | Implement combined loss V1 | ✅ DONE |
| 82 | Implement V2 initialisation with frozen classifier | ✅ DONE |
| 87 | Implement human pattern loss (1 - human_score) | ✅ DONE |
| 100 | Implement combined loss V2 | ✅ DONE |
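The V1/V2 split above suggests loss terms along these lines. The weights are illustrative assumptions, not the repo's configured values:

```python
import torch
import torch.nn.functional as F

def style_loss(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    # 1 - cosine similarity between predicted and target style fingerprints.
    return 1.0 - F.cosine_similarity(pred, target, dim=-1).mean()

def human_pattern_loss(human_score: torch.Tensor) -> torch.Tensor:
    # The frozen classifier emits P(human) in [0, 1]; push it towards 1.
    return (1.0 - human_score).mean()

def combined_loss_v2(ce, pred_style, target_style, human_score,
                     w_style=0.3, w_human=0.2):
    # V2 = V1 terms plus the anti-AI human-pattern term.
    return (ce
            + w_style * style_loss(pred_style, target_style)
            + w_human * human_pattern_loss(human_score))
```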
### [trainer.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/training/trainer.py)

| Line | TODO | Status |
|------|------|--------|
| 17 | Store loss function, fingerprinter, and tokenizer | ✅ DONE |
| 22 | Implement custom `compute_loss` | ✅ DONE |

### [callbacks.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/training/callbacks.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement evaluation-time style metric logging | ✅ DONE |
| 22 | Implement early stopping initialisation | ✅ DONE |
| 26 | Implement early stopping check | ✅ DONE |
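Plugging a composite loss into Hugging Face's Trainer means overriding compute_loss. A minimal sketch (the real trainer.py also wires in the fingerprinter and tokenizer; the exact compute_loss signature varies slightly across transformers versions, hence the **kwargs):

```python
from transformers import Trainer

class StyleTrainerSketch(Trainer):
    def __init__(self, *args, loss_fn=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = loss_fn  # e.g. combined_loss_v2 wired to a fingerprinter

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs = model(**inputs)
        loss = outputs.loss if self.loss_fn is None else self.loss_fn(outputs, inputs)
        return (loss, outputs) if return_outputs else loss
```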
### [human_pattern_extractor.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/training/human_pattern_extractor.py)

| Line | TODO | Status |
|------|------|--------|
| 68 | Implement initialisation (spaCy + GPT-2) | ✅ DONE |
| 73 | Implement GPT-2 perplexity calculation | ✅ DONE |
| 78 | Implement burstiness | ✅ DONE |
| 83 | Implement sentence starter diversity | ✅ DONE |
| 88 | Implement n-gram novelty | ✅ DONE |
| 93 | Implement AI marker density | ✅ DONE |
| 98 | Implement discourse density | ✅ DONE |
| 103 | Implement punctuation patterns | ✅ DONE |
| 108 | Implement full 17-dim feature extraction | ✅ DONE |
| 125 | Implement KaggleHumanPatternDataset loading | ✅ DONE |
| 129 | Implement `__len__` | ✅ DONE |
| 133 | Implement `__getitem__` | ✅ DONE |
| 148 | Implement HumanPatternClassifier MLP layers | ✅ DONE |
| 153 | Implement forward pass | ✅ DONE |
| 158 | Implement single-text scoring | ✅ DONE |
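Two of the 17 features are simple enough to show inline. Sketches of plausible definitions; the extractor's exact formulas may differ:

```python
import statistics

def burstiness(sentence_lengths: list) -> float:
    # Variation in sentence length (std / mean). Human text tends to be
    # burstier than generated text.
    if len(sentence_lengths) < 2:
        return 0.0
    mean = statistics.mean(sentence_lengths)
    return statistics.stdev(sentence_lengths) / mean if mean else 0.0

def ngram_novelty(tokens: list, n: int = 3) -> float:
    # Share of distinct n-grams among all n-grams; repetitive text scores low.
    grams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    return len(set(grams)) / len(grams) if grams else 0.0

print(burstiness([5, 22, 9, 31]))
print(ngram_novelty("the cat sat on the cat sat".split()))  # 0.8
```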
---

## src/vocabulary/ — 10 TODOs ✅

### [awl_loader.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/vocabulary/awl_loader.py)

| Line | TODO | Status |
|------|------|--------|
| 21 | Implement initialisation | ✅ DONE |
| 26 | Implement word list file loading | ✅ DONE |
| 31 | Implement synonym JSON loading | ✅ DONE |
| 36 | Implement `is_academic()` | ✅ DONE |
| 41 | Implement `get_academic_synonyms()` | ✅ DONE |
| 47 | Implement `all_words` property | ✅ DONE |

### [lexical_substitution.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/vocabulary/lexical_substitution.py)

| Line | TODO | Status |
|------|------|--------|
| 41 | Implement initialisation | ✅ DONE |
| 46 | Implement contextual semantic similarity | ✅ DONE |
| 51 | Implement AWL substitution generation | ✅ DONE |
| 56 | Implement vocabulary elevation | ✅ DONE |
| 106 | Implement register filtering | ✅ DONE |
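The awl_loader.py surface is a thin wrapper over two data files. A sketch with hypothetical paths and JSON shape (both are assumptions):

```python
import json

class AWLLoaderSketch:
    """Academic Word List helper; file paths and format are illustrative."""

    def __init__(self, words_path: str = "data/awl_words.txt",
                 synonyms_path: str = "data/awl_synonyms.json"):
        with open(words_path, encoding="utf-8") as f:
            self._words = {line.strip().lower() for line in f if line.strip()}
        with open(synonyms_path, encoding="utf-8") as f:
            self._synonyms = json.load(f)  # {"common_word": ["academic", ...]}

    def is_academic(self, word: str) -> bool:
        return word.lower() in self._words

    def get_academic_synonyms(self, word: str) -> list:
        return self._synonyms.get(word.lower(), [])

    @property
    def all_words(self) -> set:
        return set(self._words)
```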
### [register_filter.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/vocabulary/register_filter.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement initialisation | ✅ DONE |
| 19 | Implement nominalisation | ✅ DONE |
| 24 | Implement hedging | ✅ DONE |
| 29 | Implement formality check | ✅ DONE |

---

## src/evaluation/ — 7 TODOs ✅

### [gleu_scorer.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/evaluation/gleu_scorer.py)

| Line | TODO | Status |
|------|------|--------|
| 20 | Implement corpus-level GLEU scoring | ✅ DONE |
| 29 | Implement BERTScore computation | ✅ DONE |
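Corpus-level GLEU is available off the shelf. A sketch using NLTK's implementation, on the assumption that gleu_scorer.py uses this or an equivalent backend:

```python
from nltk.translate.gleu_score import corpus_gleu

# One list of reference token lists per hypothesis.
references = [[["the", "cat", "sat", "on", "the", "mat"]]]
hypotheses = [["the", "cat", "sat", "on", "mat"]]

print(corpus_gleu(references, hypotheses))
```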
### [errant_evaluator.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/evaluation/errant_evaluator.py)

| Line | TODO | Status |
|------|------|--------|
| 15 | Implement initialisation (ERRANT annotator) | ✅ DONE |
| 23 | Implement ERRANT evaluation | ✅ DONE |

### [style_metrics.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/evaluation/style_metrics.py)

| Line | TODO | Status |
|------|------|--------|
| 19 | Implement style similarity | ✅ DONE |
| 24 | Implement AWL coverage | ✅ DONE |
| 33 | Implement batch evaluation | ✅ DONE |

### [authorship_verifier.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/evaluation/authorship_verifier.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement initialisation (load model) | ✅ DONE |
| 19 | Implement authorship verification | ✅ DONE |

---

## src/inference/ — 3 TODOs ✅

### [corrector.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/inference/corrector.py)

| Line | TODO | Status |
|------|------|--------|
| 39 | Implement initialisation | ✅ DONE |
| 52 | Implement full correction pipeline | ✅ DONE |

### [postprocessor.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/inference/postprocessor.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement initialisation | ✅ DONE |
| 19 | Implement text cleanup | ✅ DONE |
| 27 | Implement entity restoration | ✅ DONE |
| 32 | Implement final formatting | ✅ DONE |

---

## src/api/ — 2 TODOs ✅

### [main.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/api/main.py)

| Line | TODO | Status |
|------|------|--------|
| 22 | Load config and initialise corrector on startup | ✅ DONE |
| 31 | Implement `/correct` endpoint | ✅ DONE |

### [middleware.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/api/middleware.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement request logging (timing, path, status) | ✅ DONE |
| 22 | Implement rate limiter state | ✅ DONE |
| 26 | Implement rate limiting logic | ✅ DONE |
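The middleware entries amount to a request logger plus a rate limiter. A sketch of a fixed-window limiter as FastAPI middleware; the window size and limit are made up, and middleware.py may structure this differently:

```python
import time
from collections import defaultdict

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

app = FastAPI()
WINDOW_S, LIMIT = 60, 30          # illustrative: 30 requests per minute
_hits = defaultdict(list)          # rate limiter state: client -> timestamps

@app.middleware("http")
async def rate_limit(request: Request, call_next):
    now = time.monotonic()
    client = request.client.host if request.client else "unknown"
    # Keep only hits inside the current window, then check the budget.
    _hits[client] = [t for t in _hits[client] if now - t < WINDOW_S]
    if len(_hits[client]) >= LIMIT:
        return JSONResponse({"detail": "rate limit exceeded"}, status_code=429)
    _hits[client].append(now)
    return await call_next(request)

# Run with: uvicorn this_module:app
```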
---

## scripts/ — 5 TODOs ✅

### [train.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/scripts/train.py)

| Line | TODO | Status |
|------|------|--------|
| 24 | Implement training pipeline (10 steps) | ✅ DONE |

### [evaluate.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/scripts/evaluate.py)

| Line | TODO | Status |
|------|------|--------|
| 19 | Implement evaluation pipeline | ✅ DONE |

### [run_inference.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/scripts/run_inference.py)

| Line | TODO | Status |
|------|------|--------|
| 21 | Implement inference pipeline | ✅ DONE |

### [pretrain_human_pattern_classifier.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/scripts/pretrain_human_pattern_classifier.py)

| Line | TODO | Status |
|------|------|--------|
| 23 | Implement classifier pre-training | ✅ DONE |

---

## tests/ — 18 TODOs ✅

### [test_preprocessing.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/tests/test_preprocessing.py) — 7 tests ✅
### [test_style.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/tests/test_style.py) — 4 tests ✅
### [test_model.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/tests/test_model.py) — 2 tests + 3 new ✅
### [test_vocabulary.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/tests/test_vocabulary.py) — 4 tests ✅
### [test_evaluation.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/tests/test_evaluation.py) — 4 tests ✅

---

## Shell Scripts ✅

| Script | Purpose |
|--------|---------|
| [train.sh](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/train.sh) | Multi-stage training with Skip/Redo/Continue checkpoint system |
| [start.sh](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/start.sh) | Inference launcher (CLI REPL or API server) |

---

## Summary by Package

| Package | TODOs | Status |
|---------|-------|--------|
| `src/preprocessing/` | 16 | ✅ ALL DONE |
| `src/style/` | 14 | ✅ ALL DONE |
| `src/model/` | 5 | ✅ ALL DONE |
| `src/training/` | 22 | ✅ ALL DONE |
| `src/vocabulary/` | 10 | ✅ ALL DONE |
| `src/evaluation/` | 7 | ✅ ALL DONE |
| `src/inference/` | 3 | ✅ ALL DONE |
| `src/api/` | 2 | ✅ ALL DONE |
| `scripts/` | 5 | ✅ ALL DONE |
| `tests/` | 18 | ✅ ALL DONE |
| **Total** | **97** | ✅ **ALL DONE** |
train.sh
ADDED
@@ -0,0 +1,215 @@
#!/usr/bin/env bash
# ═══════════════════════════════════════════════════════════════════════════
# train.sh — Multi-stage training orchestrator with checkpoint system
# ═══════════════════════════════════════════════════════════════════════════
#
# Usage: bash train.sh [--config=CONFIG] [--auto]
#
# Each stage prompts: [S]kip, [R]edo, [C]ontinue
# Use --auto to skip all prompts and auto-detect what needs running
#
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
export PYTHONPATH="${SCRIPT_DIR}:${PYTHONPATH:-}"
CONFIG="configs/training_config.yaml"
AUTO_MODE=false

# Parse args. A bare positional argument still selects the config file;
# grabbing "$1" directly (as before) would mistake --auto for a config path.
for arg in "$@"; do
    case $arg in
        --auto) AUTO_MODE=true ;;
        --config=*) CONFIG="${arg#*=}" ;;
        --*) ;;
        *) CONFIG="$arg" ;;
    esac
done

# ── Colors ──────────────────────────────────────────────────────────────────
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

info() { echo -e "${CYAN}[INFO]${NC} $1"; }
ok()   { echo -e "${GREEN}[ OK]${NC} $1"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
err()  { echo -e "${RED}[FAIL]${NC} $1"; }

# ── Stage prompt function ──────────────────────────────────────────────────
# Asks the user to [S]kip, [R]edo, or [C]ontinue each stage
prompt_stage() {
    local stage_name="$1"
    local check_file="$2"  # File to check if stage already completed

    echo ""
    echo -e "${BOLD}═══ Stage: ${stage_name} ═══${NC}"

    if [ "$AUTO_MODE" = true ]; then
        if [ -n "$check_file" ] && [ -e "$check_file" ]; then
            info "Auto-mode: $check_file exists, skipping"
            return 1  # Skip
        fi
        return 0  # Continue
    fi

    if [ -n "$check_file" ] && [ -e "$check_file" ]; then
        warn "Previous output found: $check_file"
        echo -e "  ${YELLOW}[S]${NC}kip | ${CYAN}[R]${NC}edo | ${GREEN}[C]${NC}ontinue"
        read -rp "  Choice [S/R/C]: " choice
        case "${choice,,}" in
            r|redo)     info "Redoing ${stage_name}...";    return 0 ;;
            c|continue) info "Continuing ${stage_name}..."; return 0 ;;
            *)          info "Skipping ${stage_name}";      return 1 ;;
        esac
    else
        info "No previous output found. Running ${stage_name}..."
        return 0
    fi
}

# ── Detect environment ─────────────────────────────────────────────────────
detect_env() {
    echo -e "${BOLD}═══ Environment Detection ═══${NC}"

    # Python
    if command -v python3 &>/dev/null; then
        PYTHON=python3
    elif command -v python &>/dev/null; then
        PYTHON=python
    else
        err "Python not found!"
        exit 1
    fi
    ok "Python: $($PYTHON --version 2>&1)"

    # GPU
    if $PYTHON -c "import torch; print(torch.cuda.is_available())" 2>/dev/null | grep -q "True"; then
        GPU_AVAILABLE=true
        GPU_NAME=$($PYTHON -c "import torch; print(torch.cuda.get_device_name(0))" 2>/dev/null || echo "Unknown")
        ok "GPU: $GPU_NAME"

        # Check compute capability for bf16
        COMPUTE_CAP=$($PYTHON -c "import torch; print(torch.cuda.get_device_capability()[0])" 2>/dev/null || echo "0")
        if [ "$COMPUTE_CAP" -ge 8 ]; then
            PRECISION="bf16"
        else
            PRECISION="fp16"
        fi
        ok "Precision: $PRECISION"
    else
        GPU_AVAILABLE=false
        PRECISION="fp32"
        warn "No GPU detected — training will use CPU (optimised settings)"
    fi

    # W&B
    if [ -n "${WANDB_API_KEY:-}" ]; then
        ok "W&B: API key found"
    else
        warn "W&B: No API key (WANDB_API_KEY). Logging to TensorBoard only."
        export WANDB_DISABLED=true
    fi
}

# ═══════════════════════════════════════════════════════════════════════════
# STAGE 1: Install dependencies & download models
# ═══════════════════════════════════════════════════════════════════════════
stage_1_setup() {
    if prompt_stage "Setup & Dependencies" ".train_stage1_done"; then
        info "Installing Python dependencies..."
        $PYTHON -m pip install -r requirements.txt --quiet 2>&1 | tail -5

        info "Downloading spaCy models..."
        $PYTHON -m spacy download en_core_web_sm --quiet 2>/dev/null || true

        info "Downloading NLTK data..."
        $PYTHON -c "import nltk; nltk.download('punkt', quiet=True); nltk.download('punkt_tab', quiet=True)" 2>/dev/null || true

        touch .train_stage1_done
        ok "Setup complete"
    fi
}

# ═══════════════════════════════════════════════════════════════════════════
# STAGE 2: Data preprocessing
# ═══════════════════════════════════════════════════════════════════════════
stage_2_preprocess() {
    if prompt_stage "Data Preprocessing" "data/processed/train.jsonl"; then
        info "Preprocessing datasets into unified JSONL..."
        $PYTHON scripts/preprocess_data.py
        ok "Data preprocessing complete"
    fi
}

# ═══════════════════════════════════════════════════════════════════════════
# STAGE 3: Pre-train human pattern classifier
# ═══════════════════════════════════════════════════════════════════════════
stage_3_pretrain_classifier() {
    if prompt_stage "Pre-train Human Pattern Classifier" "checkpoints/human_pattern_classifier.pt"; then
        info "Pre-training human pattern classifier on Kaggle datasets..."
        info "This may take a while on CPU (extracting features for ~100k texts)..."
        $PYTHON scripts/pretrain_human_pattern_classifier.py
        ok "Human pattern classifier pre-trained"
    fi
}

# ═══════════════════════════════════════════════════════════════════════════
# STAGE 4: Main model training
# ═══════════════════════════════════════════════════════════════════════════
stage_4_train() {
    if prompt_stage "Main Model Training" "checkpoints/best_model/config.json"; then
        info "Starting main model training..."
        info "Config: $CONFIG"

        # Add V2 loss flag if classifier exists
        V2_FLAG=""
        if [ -f "checkpoints/human_pattern_classifier.pt" ]; then
            info "Human pattern classifier found — using V2 loss (with anti-AI term)"
            V2_FLAG="--use-v2-loss"
        fi

        $PYTHON scripts/train.py --config "$CONFIG" $V2_FLAG
        ok "Main training complete"
    fi
}

# ═══════════════════════════════════════════════════════════════════════════
# STAGE 5: Evaluation
# ═══════════════════════════════════════════════════════════════════════════
stage_5_evaluate() {
    if prompt_stage "Evaluation" "logs/eval_results_test.json"; then
        info "Running evaluation on test set..."
        mkdir -p logs
        $PYTHON scripts/evaluate.py --config "$CONFIG" --split test
        ok "Evaluation complete"
    fi
}

# ═══════════════════════════════════════════════════════════════════════════
# Main
# ═══════════════════════════════════════════════════════════════════════════
main() {
    echo ""
    echo -e "${BOLD}╔══════════════════════════════════════════════════════════╗${NC}"
    echo -e "${BOLD}║   Dyslexia Academic Writing Corrector — Training Suite   ║${NC}"
    echo -e "${BOLD}╚══════════════════════════════════════════════════════════╝${NC}"
    echo ""

    detect_env

    stage_1_setup
    stage_2_preprocess
    stage_3_pretrain_classifier
    stage_4_train
    stage_5_evaluate

    echo ""
    echo -e "${GREEN}${BOLD}═══ All stages complete! ═══${NC}"
    echo -e "  Model saved to:  ${CYAN}checkpoints/best_model/${NC}"
    echo -e "  Eval results:    ${CYAN}logs/eval_results_test.json${NC}"
    echo -e "  Start inference: ${CYAN}bash start.sh${NC}"
    echo ""
}

main
wandb/debug-internal.log
ADDED
@@ -0,0 +1,578 @@
{"time":"2026-05-03T12:41:31.910510511+05:30","level":"INFO","msg":"wandb-core"}
{"time":"2026-05-03T12:41:31.911235013+05:30","level":"INFO","msg":"stream: starting","core version":"0.26.1"}
{"time":"2026-05-03T12:41:32.640591639+05:30","level":"INFO","msg":"stream: created new stream","id":"7q4dwe22"}
{"time":"2026-05-03T12:41:32.640743705+05:30","level":"INFO","msg":"handler: started"}
{"time":"2026-05-03T12:41:32.64115088+05:30","level":"INFO","msg":"stream: started"}
{"time":"2026-05-03T12:41:32.641160468+05:30","level":"INFO","msg":"writer: started","stream_id":"7q4dwe22"}
{"time":"2026-05-03T12:41:32.641172701+05:30","level":"INFO","msg":"sender: started"}
{"time":"2026-05-03T12:41:33.623792544+05:30","level":"INFO","msg":"filestream: sending request","total_files":1,"console_offset":0,"console_lines":1}
{"time":"2026-05-03T12:41:34.381206382+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:41:48.6250478+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":0,"events_lines":2,"console_offset":0,"console_lines":32,"uploaded_len":2}
{"time":"2026-05-03T12:41:52.610177283+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:42:03.62427825+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":2,"events_lines":2,"console_offset":31,"console_lines":2}
{"time":"2026-05-03T12:42:04.079934308+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:42:18.624675392+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":4,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:42:19.131375894+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:42:33.624454986+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":6,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:42:34.185439368+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:42:48.624368649+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":8,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:42:52.050509317+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:43:03.624817069+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":10,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:43:04.189007008+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:43:18.624408595+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":12,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:43:19.03607342+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:43:33.624862786+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":0,"history_lines":1,"events_offset":14,"events_lines":2,"console_offset":32,"console_lines":2}
{"time":"2026-05-03T12:43:34.088654055+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:43:48.623936217+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":16,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:43:52.622306426+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:44:03.624968066+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":18,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:44:04.159531988+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:44:18.62395356+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":20,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:44:19.042602519+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:44:33.624505635+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":22,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:44:34.179444461+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:44:48.624294713+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":24,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:44:52.488535013+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:45:03.624694431+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":26,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:45:04.171236603+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:45:18.624353905+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":28,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:45:19.049334269+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:45:33.625499719+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":1,"history_lines":1,"events_offset":30,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:45:34.205775314+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:45:48.624246484+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":32,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:45:52.466463116+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:46:03.624377356+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":34,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:46:04.106028784+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:46:18.623990934+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":36,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:46:19.363307766+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:46:33.624399178+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":38,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:46:34.211508133+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:46:48.624496958+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":40,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:46:56.325382987+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:47:03.624347271+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":42,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:47:04.112261534+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:47:18.624559566+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":44,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:47:19.062715354+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:47:33.62485639+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":46,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:47:34.126644783+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:47:48.624876584+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":2,"history_lines":1,"events_offset":48,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:47:52.547877604+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:48:03.624169297+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":50,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:48:04.119370364+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:48:18.624748914+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":52,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:48:19.10634659+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:48:33.624565795+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":54,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:48:34.122699515+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:48:48.624545462+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":56,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:48:52.656977803+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:49:03.624596012+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":58,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:49:04.330825648+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:49:18.624564598+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":60,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:49:19.078491359+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:49:33.624629606+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":62,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:49:34.233481381+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:49:48.623896921+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":64,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:49:52.499893573+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:50:03.625175815+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":3,"history_lines":1,"events_offset":66,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:50:04.236709822+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:50:18.624165748+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":68,"events_lines":2,"console_offset":34,"console_lines":4}
{"time":"2026-05-03T12:50:19.084054207+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:50:33.624082116+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":70,"events_lines":2,"console_offset":37,"console_lines":1}
{"time":"2026-05-03T12:50:34.239458399+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:50:48.62427245+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":4,"history_lines":1,"events_offset":72,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:50:52.159206398+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:51:03.624243519+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":74,"events_lines":2,"console_offset":37,"console_lines":8}
{"time":"2026-05-03T12:51:04.139955016+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:51:18.623551729+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":76,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:51:19.090345066+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:51:33.624706726+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":78,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:51:34.143803257+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:51:48.624581596+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":80,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:51:52.775577109+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:52:03.625946523+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":5,"history_lines":1,"events_offset":82,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:52:04.145798756+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:52:18.624567709+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":84,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:52:19.097700993+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:52:33.624587759+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":86,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:52:34.048968605+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:52:48.625017571+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":88,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:52:52.480420415+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:53:03.62479273+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":90,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:53:04.051441036+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:53:18.625071648+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":6,"history_lines":1,"events_offset":92,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:53:19.320213512+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:53:33.624825898+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":94,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:53:34.054856352+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:53:48.624712266+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":96,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:53:52.088255736+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:54:03.624162447+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":98,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:54:04.058358464+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:54:18.624352267+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":100,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:54:19.21327141+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:54:33.62520724+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":102,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:54:34.063683346+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:54:48.623773001+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":7,"history_lines":1,"events_offset":104,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:54:53.824628369+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:55:03.624869319+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":106,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:55:04.065156475+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:55:18.624737083+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":108,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:55:19.117949184+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:55:33.623963358+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":110,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:55:34.069701428+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:55:48.624698335+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":112,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:55:52.409930019+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:56:03.624554903+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":114,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:56:04.481524049+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:56:18.624936254+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":8,"history_lines":1,"events_offset":116,"events_lines":2,"console_offset":43,"console_lines":6}
{"time":"2026-05-03T12:56:19.227360947+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:56:33.625250748+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":9,"history_lines":1,"events_offset":118,"events_lines":2,"console_offset":43,"console_lines":2}
{"time":"2026-05-03T12:56:34.280924135+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:56:48.623974918+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":120,"events_lines":2,"console_offset":49,"console_lines":7}
{"time":"2026-05-03T12:56:53.838367555+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:57:03.625033774+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":122,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:57:04.183208156+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:57:18.624445078+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":124,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:57:19.131798717+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:57:33.624908594+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":126,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:57:34.055298865+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:57:48.623865111+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":128,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:57:53.848616674+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:58:03.625210172+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":130,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:58:04.188844515+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:58:18.625197519+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":10,"history_lines":1,"events_offset":132,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:58:19.24219959+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:58:33.624258984+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":134,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:58:34.396259329+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:58:48.624940129+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":136,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:58:53.848822696+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:59:03.624295658+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":138,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:59:04.400379221+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:59:18.624665157+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":140,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:59:19.145926143+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:59:33.624817526+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":142,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:59:34.206762226+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:59:48.624390304+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":144,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:59:53.860559262+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:00:03.623875503+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":146,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:00:04.202208533+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:00:18.624929907+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":148,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:00:19.050716927+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:00:33.624476231+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":11,"history_lines":1,"events_offset":150,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:00:34.307214726+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:00:48.624634057+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":152,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:00:52.469169514+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:01:03.624924814+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":154,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:01:04.107064659+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:01:18.624653321+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":156,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:01:19.066067239+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:01:33.624502786+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":158,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:01:34.109698097+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:01:48.624767253+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":160,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:01:53.785694074+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:02:03.624903217+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":162,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:02:05.035353094+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:02:18.624580087+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":164,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:02:19.078738091+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:02:33.624211562+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":166,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:02:34.116294968+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:02:48.62498625+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":12,"history_lines":1,"events_offset":168,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:02:52.549222964+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:03:03.623912797+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":170,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:03:04.118997416+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:03:18.62405338+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":172,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:03:19.20998837+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:03:33.624613805+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":174,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:03:34.123314056+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:03:48.624508585+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":176,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:03:53.78511401+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:04:03.625266311+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":13,"history_lines":1,"events_offset":178,"events_lines":2,"console_offset":54,"console_lines":6}
{"time":"2026-05-03T13:04:04.043331726+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:04:18.625263545+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":14,"history_lines":1,"events_offset":180,"events_lines":2,"console_offset":54,"console_lines":2}
{"time":"2026-05-03T13:04:19.076967316+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:04:33.624870854+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":182,"events_lines":2,"console_offset":60,"console_lines":7}
{"time":"2026-05-03T13:04:34.232591774+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:04:48.624554166+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":184,"events_lines":2,"console_offset":65,"console_lines":1}
{"time":"2026-05-03T13:04:53.893576903+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:05:03.624387682+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":186,"events_lines":2,"console_offset":65,"console_lines":1}
{"time":"2026-05-03T13:05:04.132708966+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:05:18.624056957+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":188,"events_lines":2,"console_offset":65,"console_lines":1}
{"time":"2026-05-03T13:05:19.084294163+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:05:33.623642485+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":15,"history_lines":1,"events_offset":190,"events_lines":2,"console_offset":65,"console_lines":1}
{"time":"2026-05-03T13:05:34.135980166+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:05:48.624842204+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":192,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 203 |
+
{"time":"2026-05-03T13:05:53.790732523+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 204 |
+
{"time":"2026-05-03T13:06:03.624205493+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":194,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 205 |
+
{"time":"2026-05-03T13:06:04.016288572+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 206 |
+
{"time":"2026-05-03T13:06:18.624981694+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":196,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 207 |
+
{"time":"2026-05-03T13:06:19.397699848+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 208 |
+
{"time":"2026-05-03T13:06:33.623935241+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":198,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 209 |
+
{"time":"2026-05-03T13:06:34.044819946+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 210 |
+
{"time":"2026-05-03T13:06:48.624728354+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":200,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 211 |
+
{"time":"2026-05-03T13:06:49.883946156+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 212 |
+
{"time":"2026-05-03T13:07:03.625032345+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":202,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 213 |
+
{"time":"2026-05-03T13:07:04.251028411+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 214 |
+
{"time":"2026-05-03T13:07:18.625057902+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":16,"history_lines":1,"events_offset":204,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 215 |
+
{"time":"2026-05-03T13:07:19.426577362+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 216 |
+
{"time":"2026-05-03T13:07:33.623808377+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":206,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 217 |
+
{"time":"2026-05-03T13:07:34.150919588+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 218 |
+
{"time":"2026-05-03T13:07:48.624900844+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":208,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 219 |
+
{"time":"2026-05-03T13:07:53.732010245+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 220 |
+
{"time":"2026-05-03T13:08:03.624036094+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":210,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 221 |
+
{"time":"2026-05-03T13:08:04.153782378+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 222 |
+
{"time":"2026-05-03T13:08:18.623716099+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":212,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 223 |
+
{"time":"2026-05-03T13:08:19.213228745+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 224 |
+
{"time":"2026-05-03T13:08:33.625509812+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":17,"history_lines":1,"events_offset":214,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 225 |
+
{"time":"2026-05-03T13:08:34.273258552+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 226 |
+
{"time":"2026-05-03T13:08:48.623779221+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":216,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 227 |
+
{"time":"2026-05-03T13:08:53.74462565+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 228 |
+
{"time":"2026-05-03T13:09:03.624590421+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":218,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 229 |
+
{"time":"2026-05-03T13:09:04.265483888+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 230 |
+
{"time":"2026-05-03T13:09:18.624592677+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":220,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 231 |
+
{"time":"2026-05-03T13:09:19.111922405+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 232 |
+
{"time":"2026-05-03T13:09:33.624187264+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":18,"history_lines":1,"events_offset":222,"events_lines":2,"console_offset":65,"console_lines":6}
|
| 233 |
+
{"time":"2026-05-03T13:09:34.06163637+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 234 |
+
{"time":"2026-05-03T13:09:48.62435919+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":19,"history_lines":1,"events_offset":224,"events_lines":2,"console_offset":65,"console_lines":2}
|
| 235 |
+
{"time":"2026-05-03T13:09:53.928179314+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 236 |
+
{"time":"2026-05-03T13:10:03.624241639+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":226,"events_lines":2,"console_offset":71,"console_lines":7}
|
| 237 |
+
{"time":"2026-05-03T13:10:04.269883116+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 238 |
+
{"time":"2026-05-03T13:10:18.623698326+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":228,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 239 |
+
{"time":"2026-05-03T13:10:19.220580256+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 240 |
+
{"time":"2026-05-03T13:10:33.625208324+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":230,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 241 |
+
{"time":"2026-05-03T13:10:34.377376961+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 242 |
+
{"time":"2026-05-03T13:10:48.624352793+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":232,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 243 |
+
{"time":"2026-05-03T13:10:53.832676215+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 244 |
+
{"time":"2026-05-03T13:11:03.624982324+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":20,"history_lines":1,"events_offset":234,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 245 |
+
{"time":"2026-05-03T13:11:04.079992269+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 246 |
+
{"time":"2026-05-03T13:11:18.62489984+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":236,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 247 |
+
{"time":"2026-05-03T13:11:19.027282435+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 248 |
+
{"time":"2026-05-03T13:11:33.624563879+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":238,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 249 |
+
{"time":"2026-05-03T13:11:34.041564944+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 250 |
+
{"time":"2026-05-03T13:11:48.624587725+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":240,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 251 |
+
{"time":"2026-05-03T13:11:53.917050708+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 252 |
+
{"time":"2026-05-03T13:12:03.62517304+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":242,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 253 |
+
{"time":"2026-05-03T13:12:04.092490879+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 254 |
+
{"time":"2026-05-03T13:12:18.624426504+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":244,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 255 |
+
{"time":"2026-05-03T13:12:19.136291819+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 256 |
+
{"time":"2026-05-03T13:12:33.62422219+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":246,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 257 |
+
{"time":"2026-05-03T13:12:34.159602779+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 258 |
+
{"time":"2026-05-03T13:12:48.625348431+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":21,"history_lines":1,"events_offset":248,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 259 |
+
{"time":"2026-05-03T13:12:52.515794539+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 260 |
+
{"time":"2026-05-03T13:13:03.624617124+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":250,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 261 |
+
{"time":"2026-05-03T13:13:04.189876876+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 262 |
+
{"time":"2026-05-03T13:13:18.624355863+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":252,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 263 |
+
{"time":"2026-05-03T13:13:19.242568869+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 264 |
+
{"time":"2026-05-03T13:13:33.62437469+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":254,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 265 |
+
{"time":"2026-05-03T13:13:34.203229293+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 266 |
+
{"time":"2026-05-03T13:13:48.624058475+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":22,"history_lines":1,"events_offset":256,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 267 |
+
{"time":"2026-05-03T13:13:52.522178792+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 268 |
+
{"time":"2026-05-03T13:14:03.624159107+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":258,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 269 |
+
{"time":"2026-05-03T13:14:04.197766657+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 270 |
+
{"time":"2026-05-03T13:14:18.624297209+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":23,"history_lines":1,"events_offset":260,"events_lines":2,"console_offset":76,"console_lines":6}
|
| 271 |
+
{"time":"2026-05-03T13:14:19.249825938+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 272 |
+
{"time":"2026-05-03T13:14:33.62367618+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":24,"history_lines":1,"events_offset":262,"events_lines":2,"console_offset":76,"console_lines":2}
|
| 273 |
+
{"time":"2026-05-03T13:14:34.200330044+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 274 |
+
{"time":"2026-05-03T13:14:48.623745102+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":264,"events_lines":2,"console_offset":82,"console_lines":7}
|
| 275 |
+
{"time":"2026-05-03T13:14:52.630682314+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 276 |
+
{"time":"2026-05-03T13:15:03.625600932+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":25,"history_lines":1,"events_offset":266,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 277 |
+
{"time":"2026-05-03T13:15:04.10091133+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 278 |
+
{"time":"2026-05-03T13:15:18.624529784+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":268,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 279 |
+
{"time":"2026-05-03T13:15:19.358910816+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 280 |
+
{"time":"2026-05-03T13:15:33.624544924+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":270,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 281 |
+
{"time":"2026-05-03T13:15:34.104052485+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 282 |
+
{"time":"2026-05-03T13:15:48.624128201+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":272,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 283 |
+
{"time":"2026-05-03T13:15:52.637656189+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 284 |
+
{"time":"2026-05-03T13:16:03.624463395+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":274,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 285 |
+
{"time":"2026-05-03T13:16:04.215823444+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 286 |
+
{"time":"2026-05-03T13:16:18.624989849+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":26,"history_lines":1,"events_offset":276,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 287 |
+
{"time":"2026-05-03T13:16:19.263366851+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 288 |
+
{"time":"2026-05-03T13:16:33.624681349+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":278,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 289 |
+
{"time":"2026-05-03T13:16:34.109971882+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 290 |
+
{"time":"2026-05-03T13:16:48.624154227+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":280,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 291 |
+
{"time":"2026-05-03T13:16:53.911552782+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 292 |
+
{"time":"2026-05-03T13:17:03.624780158+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":282,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 293 |
+
{"time":"2026-05-03T13:17:04.114449708+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 294 |
+
{"time":"2026-05-03T13:17:18.623982256+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":284,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 295 |
+
{"time":"2026-05-03T13:17:19.064687832+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 296 |
+
{"time":"2026-05-03T13:17:33.624478322+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":27,"history_lines":1,"events_offset":286,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 297 |
+
{"time":"2026-05-03T13:17:34.117543273+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 298 |
+
{"time":"2026-05-03T13:17:48.624168179+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":288,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 299 |
+
{"time":"2026-05-03T13:17:52.550289777+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 300 |
+
{"time":"2026-05-03T13:18:03.624722476+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":290,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 301 |
+
{"time":"2026-05-03T13:18:04.020487283+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 302 |
+
{"time":"2026-05-03T13:18:18.624886096+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":292,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 303 |
+
{"time":"2026-05-03T13:18:19.07233748+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 304 |
+
{"time":"2026-05-03T13:18:33.625368527+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":294,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 305 |
+
{"time":"2026-05-03T13:18:34.124776857+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 306 |
+
{"time":"2026-05-03T13:18:48.624611181+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":296,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 307 |
+
{"time":"2026-05-03T13:18:52.419268795+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 308 |
+
{"time":"2026-05-03T13:19:03.624283998+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":298,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 309 |
+
{"time":"2026-05-03T13:19:04.230735811+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 310 |
+
{"time":"2026-05-03T13:19:18.624409847+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":28,"history_lines":1,"events_offset":300,"events_lines":2,"console_offset":87,"console_lines":6}
|
| 311 |
+
{"time":"2026-05-03T13:19:19.180387589+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 312 |
+
{"time":"2026-05-03T13:19:33.624822037+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":29,"history_lines":1,"events_offset":302,"events_lines":2,"console_offset":87,"console_lines":2}
|
| 313 |
+
{"time":"2026-05-03T13:19:34.132064256+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 314 |
+
{"time":"2026-05-03T13:19:48.624533775+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":304,"events_lines":2,"console_offset":93,"console_lines":7}
|
| 315 |
+
{"time":"2026-05-03T13:19:52.564304075+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 316 |
+
{"time":"2026-05-03T13:20:03.623942702+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":306,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 317 |
+
{"time":"2026-05-03T13:20:04.136088386+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 318 |
+
{"time":"2026-05-03T13:20:18.62579199+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":308,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 319 |
+
{"time":"2026-05-03T13:20:19.085933299+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 320 |
+
{"time":"2026-05-03T13:20:33.624760978+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":310,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 321 |
+
{"time":"2026-05-03T13:20:34.350997003+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 322 |
+
{"time":"2026-05-03T13:20:48.624959899+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":30,"history_lines":1,"events_offset":312,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 323 |
+
{"time":"2026-05-03T13:20:53.800045176+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 324 |
+
{"time":"2026-05-03T13:21:03.624629627+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":314,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 325 |
+
{"time":"2026-05-03T13:21:04.347251759+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 326 |
+
{"time":"2026-05-03T13:21:18.624838853+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":316,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 327 |
+
{"time":"2026-05-03T13:21:19.19502873+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 328 |
+
{"time":"2026-05-03T13:21:33.624780545+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":318,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 329 |
+
{"time":"2026-05-03T13:21:34.146731155+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 330 |
+
{"time":"2026-05-03T13:21:48.623883165+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":320,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 331 |
+
{"time":"2026-05-03T13:21:53.802198181+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 332 |
+
{"time":"2026-05-03T13:22:03.623771205+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":31,"history_lines":1,"events_offset":322,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 333 |
+
{"time":"2026-05-03T13:22:04.25178541+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 334 |
+
{"time":"2026-05-03T13:22:18.62418141+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":324,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 335 |
+
{"time":"2026-05-03T13:22:19.203144297+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 336 |
+
{"time":"2026-05-03T13:22:33.624683769+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":32,"history_lines":1,"events_offset":326,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 337 |
+
{"time":"2026-05-03T13:22:34.152272658+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 338 |
+
{"time":"2026-05-03T13:22:48.624033114+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":328,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 339 |
+
{"time":"2026-05-03T13:22:52.585121776+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 340 |
+
{"time":"2026-05-03T13:23:03.624122754+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":33,"history_lines":1,"events_offset":330,"events_lines":2,"console_offset":98,"console_lines":6}
|
| 341 |
+
{"time":"2026-05-03T13:23:04.258539089+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 342 |
+
{"time":"2026-05-03T13:23:18.624268783+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":34,"history_lines":1,"events_offset":332,"events_lines":2,"console_offset":98,"console_lines":2}
|
| 343 |
+
{"time":"2026-05-03T13:23:19.105851896+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 344 |
+
{"time":"2026-05-03T13:23:33.624268784+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":334,"events_lines":2,"console_offset":104,"console_lines":7}
|
| 345 |
+
{"time":"2026-05-03T13:23:34.158773733+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 346 |
+
{"time":"2026-05-03T13:23:48.624502957+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":336,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 347 |
+
{"time":"2026-05-03T13:23:52.488571215+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 348 |
+
{"time":"2026-05-03T13:24:03.624346324+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":35,"history_lines":1,"events_offset":338,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 349 |
+
{"time":"2026-05-03T13:24:04.093442826+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 350 |
+
{"time":"2026-05-03T13:24:18.623881299+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":340,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 351 |
+
{"time":"2026-05-03T13:24:19.215891895+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 352 |
+
{"time":"2026-05-03T13:24:33.624223866+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":36,"history_lines":1,"events_offset":342,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 353 |
+
{"time":"2026-05-03T13:24:34.165786211+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 354 |
+
{"time":"2026-05-03T13:24:48.624101056+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":344,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 355 |
+
{"time":"2026-05-03T13:24:52.418222751+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 356 |
+
{"time":"2026-05-03T13:25:03.624050062+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":37,"history_lines":1,"events_offset":346,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 357 |
+
{"time":"2026-05-03T13:25:04.077447927+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 358 |
+
{"time":"2026-05-03T13:25:18.623724289+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":348,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 359 |
+
{"time":"2026-05-03T13:25:19.223123465+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 360 |
+
{"time":"2026-05-03T13:25:33.623791077+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":38,"history_lines":1,"events_offset":350,"events_lines":2,"console_offset":109,"console_lines":6}
|
| 361 |
+
{"time":"2026-05-03T13:25:34.070406215+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 362 |
+
{"time":"2026-05-03T13:25:48.623830574+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":39,"history_lines":1,"events_offset":352,"events_lines":2,"console_offset":109,"console_lines":2}
|
| 363 |
+
{"time":"2026-05-03T13:25:52.510007264+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 364 |
+
{"time":"2026-05-03T13:26:03.623789151+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":354,"events_lines":2,"console_offset":115,"console_lines":7}
|
| 365 |
+
{"time":"2026-05-03T13:26:04.176987671+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 366 |
+
{"time":"2026-05-03T13:26:18.624463038+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":40,"history_lines":1,"events_offset":356,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 367 |
+
{"time":"2026-05-03T13:26:19.126789241+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 368 |
+
{"time":"2026-05-03T13:26:33.623945558+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":358,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 369 |
+
{"time":"2026-05-03T13:26:34.078975811+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 370 |
+
{"time":"2026-05-03T13:26:48.62452134+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":41,"history_lines":1,"events_offset":360,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 371 |
+
{"time":"2026-05-03T13:26:54.048538082+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 372 |
+
{"time":"2026-05-03T13:27:03.623969961+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":362,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 373 |
+
{"time":"2026-05-03T13:27:04.006756603+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 374 |
+
{"time":"2026-05-03T13:27:18.6247665+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":364,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 375 |
+
{"time":"2026-05-03T13:27:19.032623956+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 376 |
+
{"time":"2026-05-03T13:27:33.624465658+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":42,"history_lines":1,"events_offset":366,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 377 |
+
{"time":"2026-05-03T13:27:34.195667306+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 378 |
+
{"time":"2026-05-03T13:27:48.624979333+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":368,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 379 |
+
{"time":"2026-05-03T13:27:52.618971883+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 380 |
+
{"time":"2026-05-03T13:28:03.623830561+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":370,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 381 |
+
{"time":"2026-05-03T13:28:04.089199692+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 382 |
+
{"time":"2026-05-03T13:28:18.623857839+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":43,"history_lines":1,"events_offset":372,"events_lines":2,"console_offset":120,"console_lines":6}
|
| 383 |
+
{"time":"2026-05-03T13:28:19.013505636+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 384 |
+
{"time":"2026-05-03T13:28:33.623790758+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":44,"history_lines":1,"events_offset":374,"events_lines":2,"console_offset":120,"console_lines":2}
|
| 385 |
+
{"time":"2026-05-03T13:28:34.177926015+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 386 |
+
{"time":"2026-05-03T13:28:48.624002804+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":376,"events_lines":2,"console_offset":126,"console_lines":7}
|
| 387 |
+
{"time":"2026-05-03T13:28:53.754090339+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 388 |
+
{"time":"2026-05-03T13:29:03.624437751+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":378,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 389 |
+
{"time":"2026-05-03T13:29:04.204898109+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 390 |
+
{"time":"2026-05-03T13:29:18.624210007+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":45,"history_lines":1,"events_offset":380,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 391 |
+
{"time":"2026-05-03T13:29:19.046606716+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 392 |
+
{"time":"2026-05-03T13:29:33.624220653+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":382,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 393 |
+
{"time":"2026-05-03T13:29:34.202729749+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 394 |
+
{"time":"2026-05-03T13:29:48.62385034+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":46,"history_lines":1,"events_offset":384,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 395 |
+
{"time":"2026-05-03T13:29:53.862105869+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 396 |
+
{"time":"2026-05-03T13:30:03.624007753+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":386,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 397 |
+
{"time":"2026-05-03T13:30:04.103026989+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 398 |
+
{"time":"2026-05-03T13:30:18.624599295+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":47,"history_lines":1,"events_offset":388,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 399 |
+
{"time":"2026-05-03T13:30:19.052388549+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 400 |
+
{"time":"2026-05-03T13:30:33.623750158+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":390,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 401 |
+
{"time":"2026-05-03T13:30:34.105389648+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 402 |
+
{"time":"2026-05-03T13:30:48.625066437+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":392,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 403 |
+
{"time":"2026-05-03T13:30:53.76766617+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 404 |
+
{"time":"2026-05-03T13:31:03.623820021+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":48,"history_lines":1,"events_offset":394,"events_lines":2,"console_offset":131,"console_lines":6}
|
| 405 |
+
{"time":"2026-05-03T13:31:04.212136382+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 406 |
+
{"time":"2026-05-03T13:31:18.624132197+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":49,"history_lines":1,"events_offset":396,"events_lines":2,"console_offset":131,"console_lines":2}
|
| 407 |
+
{"time":"2026-05-03T13:31:19.060752483+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 408 |
+
{"time":"2026-05-03T13:31:33.623935802+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":398,"events_lines":2,"console_offset":137,"console_lines":7}
|
| 409 |
+
{"time":"2026-05-03T13:31:34.113303634+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 410 |
+
{"time":"2026-05-03T13:31:48.624087183+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":50,"history_lines":1,"events_offset":400,"events_lines":2,"console_offset":142,"console_lines":1}
|
| 411 |
+
{"time":"2026-05-03T13:31:53.77424155+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 412 |
+
{"time":"2026-05-03T13:32:03.624427221+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":402,"events_lines":2,"console_offset":142,"console_lines":1}
|
| 413 |
+
{"time":"2026-05-03T13:32:04.116482624+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 414 |
+
{"time":"2026-05-03T13:32:18.624173217+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":51,"history_lines":1,"events_offset":404,"events_lines":2,"console_offset":142,"console_lines":1}
|
| 415 |
+
{"time":"2026-05-03T13:32:19.068688911+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 416 |
+
{"time":"2026-05-03T13:32:33.624454974+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":406,"events_lines":2,"console_offset":142,"console_lines":1}
|
| 417 |
+
{"time":"2026-05-03T13:32:34.120032583+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 418 |
+
{"time":"2026-05-03T13:32:48.624562713+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":52,"history_lines":1,"events_offset":408,"events_lines":2,"console_offset":142,"console_lines":1}
|
| 419 |
+
{"time":"2026-05-03T13:32:53.790901488+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 420 |
+
{"time":"2026-05-03T13:33:03.623780704+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":410,"events_lines":2,"console_offset":142,"console_lines":1}
|
| 421 |
+
{"time":"2026-05-03T13:33:04.227727807+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 422 |
+
{"time":"2026-05-03T13:33:18.62449669+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":53,"history_lines":1,"events_offset":412,"events_lines":2,"console_offset":142,"console_lines":6}
|
| 423 |
+
{"time":"2026-05-03T13:33:19.074032897+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 424 |
+
{"time":"2026-05-03T13:33:33.623647814+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":54,"history_lines":1,"events_offset":414,"events_lines":2,"console_offset":142,"console_lines":2}
|
| 425 |
+
{"time":"2026-05-03T13:33:34.538702583+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 426 |
+
{"time":"2026-05-03T13:33:48.623834252+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":416,"events_lines":2,"console_offset":148,"console_lines":7}
|
| 427 |
+
{"time":"2026-05-03T13:33:53.786642254+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 428 |
+
{"time":"2026-05-03T13:34:03.623944615+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":418,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 429 |
+
{"time":"2026-05-03T13:34:04.542055301+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 430 |
+
{"time":"2026-05-03T13:34:18.624315311+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":55,"history_lines":1,"events_offset":420,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 431 |
+
{"time":"2026-05-03T13:34:19.087548151+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 432 |
+
{"time":"2026-05-03T13:34:33.624071687+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":422,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 433 |
+
{"time":"2026-05-03T13:34:34.662667927+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 434 |
+
{"time":"2026-05-03T13:34:48.62427173+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":56,"history_lines":1,"events_offset":424,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 435 |
+
{"time":"2026-05-03T13:34:50.512616498+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 436 |
+
{"time":"2026-05-03T13:35:03.624076367+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":426,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 437 |
+
{"time":"2026-05-03T13:35:04.44636868+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 438 |
+
{"time":"2026-05-03T13:35:18.624696413+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":428,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 439 |
+
{"time":"2026-05-03T13:35:19.11344712+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 440 |
+
{"time":"2026-05-03T13:35:33.624072402+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":430,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 441 |
+
{"time":"2026-05-03T13:35:34.143662028+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 442 |
+
{"time":"2026-05-03T13:35:48.62468305+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":57,"history_lines":1,"events_offset":432,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 443 |
+
{"time":"2026-05-03T13:35:53.802858739+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 444 |
+
{"time":"2026-05-03T13:36:03.624546268+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":434,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 445 |
+
{"time":"2026-05-03T13:36:04.14406513+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 446 |
+
{"time":"2026-05-03T13:36:18.624111624+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":58,"history_lines":1,"events_offset":436,"events_lines":2,"console_offset":153,"console_lines":6}
|
| 447 |
+
{"time":"2026-05-03T13:36:19.096099413+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 448 |
+
{"time":"2026-05-03T13:36:33.624290887+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":59,"history_lines":1,"events_offset":438,"events_lines":2,"console_offset":153,"console_lines":2}
|
| 449 |
+
{"time":"2026-05-03T13:36:34.257147799+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 450 |
+
{"time":"2026-05-03T13:36:48.624306903+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":440,"events_lines":2,"console_offset":159,"console_lines":7}
|
| 451 |
+
{"time":"2026-05-03T13:36:52.478397774+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 452 |
+
{"time":"2026-05-03T13:37:03.623946629+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":60,"history_lines":1,"events_offset":442,"events_lines":2,"console_offset":164,"console_lines":1}
|
| 453 |
+
{"time":"2026-05-03T13:37:04.151079888+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 454 |
+
{"time":"2026-05-03T13:37:18.624500991+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":444,"events_lines":2,"console_offset":164,"console_lines":1}
|
| 455 |
+
{"time":"2026-05-03T13:37:19.204796067+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 456 |
+
{"time":"2026-05-03T13:37:33.624178719+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":61,"history_lines":1,"events_offset":446,"events_lines":2,"console_offset":164,"console_lines":1}
|
| 457 |
+
{"time":"2026-05-03T13:37:34.256997889+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 458 |
+
{"time":"2026-05-03T13:37:48.624482097+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":448,"events_lines":2,"console_offset":164,"console_lines":1}
|
| 459 |
+
{"time":"2026-05-03T13:37:53.920905508+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 460 |
+
{"time":"2026-05-03T13:38:03.624627196+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":62,"history_lines":1,"events_offset":450,"events_lines":2,"console_offset":164,"console_lines":1}
|
| 461 |
+
{"time":"2026-05-03T13:38:04.158192542+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 462 |
+
{"time":"2026-05-03T13:38:18.623873971+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":452,"events_lines":2,"console_offset":164,"console_lines":1}
|
| 463 |
+
{"time":"2026-05-03T13:38:19.10989055+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 464 |
+
{"time":"2026-05-03T13:38:33.624647717+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":63,"history_lines":1,"events_offset":454,"events_lines":2,"console_offset":164,"console_lines":6}
|
| 465 |
+
{"time":"2026-05-03T13:38:34.06006327+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 466 |
+
{"time":"2026-05-03T13:38:48.624391972+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":64,"history_lines":1,"events_offset":456,"events_lines":2,"console_offset":164,"console_lines":2}
|
| 467 |
+
{"time":"2026-05-03T13:38:52.511278211+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 468 |
+
{"time":"2026-05-03T13:39:03.62468486+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":458,"events_lines":2,"console_offset":170,"console_lines":7}
|
| 469 |
+
{"time":"2026-05-03T13:39:04.063896486+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 470 |
+
{"time":"2026-05-03T13:39:18.625250663+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":460,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 471 |
+
{"time":"2026-05-03T13:39:19.117425843+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 472 |
+
{"time":"2026-05-03T13:39:33.62476815+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":462,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 473 |
+
{"time":"2026-05-03T13:39:34.17127823+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 474 |
+
{"time":"2026-05-03T13:39:48.62534124+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":464,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 475 |
+
{"time":"2026-05-03T13:39:53.832331787+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 476 |
+
{"time":"2026-05-03T13:40:03.624605738+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":466,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 477 |
+
{"time":"2026-05-03T13:40:04.284207372+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 478 |
+
{"time":"2026-05-03T13:40:18.625015777+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":65,"history_lines":1,"events_offset":468,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 479 |
+
{"time":"2026-05-03T13:40:19.328978383+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 480 |
+
{"time":"2026-05-03T13:40:33.624370445+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":470,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 481 |
+
{"time":"2026-05-03T13:40:34.382233692+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 482 |
+
{"time":"2026-05-03T13:40:48.625233991+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":472,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 483 |
+
{"time":"2026-05-03T13:40:52.447473984+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 484 |
+
{"time":"2026-05-03T13:41:03.624167966+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":474,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 485 |
+
{"time":"2026-05-03T13:41:04.079693644+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 486 |
+
{"time":"2026-05-03T13:41:18.624191436+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":476,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 487 |
+
{"time":"2026-05-03T13:41:19.129757502+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 488 |
+
{"time":"2026-05-03T13:41:33.624909006+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":478,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 489 |
+
{"time":"2026-05-03T13:41:34.184623991+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 490 |
+
{"time":"2026-05-03T13:41:48.624963284+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":66,"history_lines":1,"events_offset":480,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 491 |
+
{"time":"2026-05-03T13:41:52.719060595+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 492 |
+
{"time":"2026-05-03T13:42:03.625876072+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":482,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 493 |
+
{"time":"2026-05-03T13:42:04.187434496+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 494 |
+
{"time":"2026-05-03T13:42:18.626563405+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":484,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 495 |
+
{"time":"2026-05-03T13:42:19.137988786+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 496 |
+
{"time":"2026-05-03T13:42:33.624934109+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":486,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 497 |
+
{"time":"2026-05-03T13:42:34.204316513+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 498 |
+
{"time":"2026-05-03T13:42:48.624833123+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":488,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 499 |
+
{"time":"2026-05-03T13:42:52.521199735+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 500 |
+
{"time":"2026-05-03T13:43:03.624834571+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":67,"history_lines":1,"events_offset":490,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 501 |
+
{"time":"2026-05-03T13:43:04.19529391+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 502 |
+
{"time":"2026-05-03T13:43:18.624997373+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":492,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 503 |
+
{"time":"2026-05-03T13:43:19.145068366+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 504 |
+
{"time":"2026-05-03T13:43:33.624157195+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":494,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 505 |
+
{"time":"2026-05-03T13:43:34.20597247+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 506 |
+
{"time":"2026-05-03T13:43:48.624978949+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":496,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 507 |
+
{"time":"2026-05-03T13:43:53.858854775+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 508 |
+
{"time":"2026-05-03T13:44:03.623885144+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":498,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 509 |
+
{"time":"2026-05-03T13:44:04.203947716+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 510 |
+
{"time":"2026-05-03T13:44:18.62381564+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":500,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 511 |
+
{"time":"2026-05-03T13:44:19.052290808+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 512 |
+
{"time":"2026-05-03T13:44:33.624777738+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":68,"history_lines":1,"events_offset":502,"events_lines":2,"console_offset":175,"console_lines":6}
|
| 513 |
+
{"time":"2026-05-03T13:44:34.104649872+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 514 |
+
{"time":"2026-05-03T13:44:48.625801712+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":69,"history_lines":1,"events_offset":504,"events_lines":2,"console_offset":175,"console_lines":2}
|
| 515 |
+
{"time":"2026-05-03T13:44:52.535011903+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 516 |
+
{"time":"2026-05-03T13:45:03.624781826+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":506,"events_lines":2,"console_offset":181,"console_lines":7}
|
| 517 |
+
{"time":"2026-05-03T13:45:06.161240364+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 518 |
+
{"time":"2026-05-03T13:45:18.624558731+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":508,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 519 |
+
{"time":"2026-05-03T13:45:19.159052248+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 520 |
+
{"time":"2026-05-03T13:45:33.624946738+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":510,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 521 |
+
{"time":"2026-05-03T13:45:34.109361541+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 522 |
+
{"time":"2026-05-03T13:45:48.623981354+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":512,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 523 |
+
{"time":"2026-05-03T13:45:53.77168815+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 524 |
+
{"time":"2026-05-03T13:46:03.624285248+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":70,"history_lines":1,"events_offset":514,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 525 |
+
{"time":"2026-05-03T13:46:04.21537653+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 526 |
+
{"time":"2026-05-03T13:46:18.624800855+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":516,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 527 |
+
{"time":"2026-05-03T13:46:19.063386972+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 528 |
+
{"time":"2026-05-03T13:46:33.624648964+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":518,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 529 |
+
{"time":"2026-05-03T13:46:34.118713729+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 530 |
+
{"time":"2026-05-03T13:46:48.624555173+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":520,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 531 |
+
{"time":"2026-05-03T13:46:52.650911087+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 532 |
+
{"time":"2026-05-03T13:47:03.624449509+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":522,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 533 |
+
{"time":"2026-05-03T13:47:04.222356182+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 534 |
+
{"time":"2026-05-03T13:47:18.625951064+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":71,"history_lines":1,"events_offset":524,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 535 |
+
{"time":"2026-05-03T13:47:19.071276729+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 536 |
+
{"time":"2026-05-03T13:47:33.624367816+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":526,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 537 |
+
{"time":"2026-05-03T13:47:34.124782785+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 538 |
+
{"time":"2026-05-03T13:47:48.624182215+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":528,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 539 |
+
{"time":"2026-05-03T13:47:53.784893271+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 540 |
+
{"time":"2026-05-03T13:48:03.624971356+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":530,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 541 |
+
{"time":"2026-05-03T13:48:04.238975318+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 542 |
+
{"time":"2026-05-03T13:48:18.623911134+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":532,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 543 |
+
{"time":"2026-05-03T13:48:19.078391518+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 544 |
+
{"time":"2026-05-03T13:48:33.624018702+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":534,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 545 |
+
{"time":"2026-05-03T13:48:34.188258053+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 546 |
+
{"time":"2026-05-03T13:48:48.624560208+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":72,"history_lines":1,"events_offset":536,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 547 |
+
{"time":"2026-05-03T13:48:54.918684542+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 548 |
+
{"time":"2026-05-03T13:49:03.624855319+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":538,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 549 |
+
{"time":"2026-05-03T13:49:04.128716752+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 550 |
+
{"time":"2026-05-03T13:49:18.624421818+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":540,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 551 |
+
{"time":"2026-05-03T13:49:19.394206735+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 552 |
+
{"time":"2026-05-03T13:49:33.624689788+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":542,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 553 |
+
{"time":"2026-05-03T13:49:34.136619823+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 554 |
+
{"time":"2026-05-03T13:49:48.624641418+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":544,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 555 |
+
{"time":"2026-05-03T13:49:52.571726058+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 556 |
+
{"time":"2026-05-03T13:50:03.624656992+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":546,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 557 |
+
{"time":"2026-05-03T13:50:04.109888074+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 558 |
+
{"time":"2026-05-03T13:50:18.625313855+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":73,"history_lines":1,"events_offset":548,"events_lines":2,"console_offset":186,"console_lines":6}
|
| 559 |
+
{"time":"2026-05-03T13:50:19.195411994+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 560 |
+
{"time":"2026-05-03T13:50:33.624802417+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":550,"events_lines":2,"console_offset":187,"console_lines":1}
|
| 561 |
+
{"time":"2026-05-03T13:50:34.049371004+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 562 |
+
{"time":"2026-05-03T13:50:48.624139142+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":74,"history_lines":1,"events_offset":552,"events_lines":2,"console_offset":186,"console_lines":2}
|
| 563 |
+
{"time":"2026-05-03T13:50:52.574063905+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 564 |
+
{"time":"2026-05-03T13:51:03.625454368+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":554,"events_lines":2,"console_offset":192,"console_lines":7}
|
| 565 |
+
{"time":"2026-05-03T13:51:04.045280089+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 566 |
+
{"time":"2026-05-03T13:51:18.624804269+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":556,"events_lines":2,"console_offset":197,"console_lines":1}
|
| 567 |
+
{"time":"2026-05-03T13:51:19.059234035+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 568 |
+
{"time":"2026-05-03T13:51:33.62451164+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":558,"events_lines":2,"console_offset":197,"console_lines":1}
|
| 569 |
+
{"time":"2026-05-03T13:51:34.050683028+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 570 |
+
{"time":"2026-05-03T13:51:48.624511766+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":560,"events_lines":2,"console_offset":197,"console_lines":1}
|
| 571 |
+
{"time":"2026-05-03T13:51:52.584237864+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 572 |
+
{"time":"2026-05-03T13:52:03.453661092+05:30","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 573 |
+
{"time":"2026-05-03T13:52:03.455774675+05:30","level":"INFO","msg":"filestream: sending request","total_files":3,"history_offset":75,"history_lines":1,"console_offset":197,"console_lines":37,"uploaded_len":3,"complete":true,"exit_code":1}
|
| 574 |
+
{"time":"2026-05-03T13:52:03.938466542+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 575 |
+
{"time":"2026-05-03T13:52:03.939642046+05:30","level":"INFO","msg":"stream: finishing up"}
|
| 576 |
+
{"time":"2026-05-03T13:52:03.940171233+05:30","level":"INFO","msg":"handler: closed"}
|
| 577 |
+
{"time":"2026-05-03T13:52:03.943488073+05:30","level":"INFO","msg":"sender: closed"}
|
| 578 |
+
{"time":"2026-05-03T13:52:03.943913795+05:30","level":"INFO","msg":"stream: all finished"}
|
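The `filestream: sending request` / `filestream: request sent` pairs above are the wandb service flushing buffered history, events, and console lines to the server on a roughly 15-second cadence; the final entry with `"complete":true,"exit_code":1` marks the run finishing with a non-zero exit status. As a rough way to inspect this cadence offline, a minimal sketch that pairs each send with the following acknowledgement (the helper names and default path are illustrative, not part of the wandb API; it assumes one JSON object per line, as in the log above):

```python
import json
from datetime import datetime

def _parse_ts(s: str) -> datetime:
    # The log stamps nanoseconds (e.g. 13:42:33.624934109+05:30); trim the
    # fractional part to microseconds so datetime.fromisoformat accepts it.
    body, tz = s[:-6], s[-6:]          # split off the "+05:30" offset
    if "." in body:
        sec, frac = body.split(".")
        body = f"{sec}.{frac[:6]}"
    return datetime.fromisoformat(body + tz)

def filestream_latencies(path: str = "wandb/debug-internal.log"):
    """Pair each 'sending request' entry with the next 'request sent' entry."""
    pending = None
    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            try:
                rec = json.loads(raw)
            except json.JSONDecodeError:
                continue  # ignore any non-JSON lines
            ts = _parse_ts(rec["time"])
            if rec.get("msg") == "filestream: sending request":
                pending = ts
            elif rec.get("msg") == "filestream: request sent" and pending:
                yield ts, (ts - pending).total_seconds()
                pending = None

for sent_at, secs in filestream_latencies():
    print(f"{sent_at.isoformat()}  round-trip {secs:.3f}s")
```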
wandb/debug.log
ADDED
@@ -0,0 +1,24 @@
+2026-05-03 12:41:31,519 INFO MainThread:34388 [wandb_setup.py:_flush():81] Current SDK version is 0.26.1
+2026-05-03 12:41:31,519 INFO MainThread:34388 [wandb_setup.py:_flush():81] Configure stats pid to 34388
+2026-05-03 12:41:31,520 INFO MainThread:34388 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-05-03 12:41:31,520 INFO MainThread:34388 [wandb_init.py:setup_run_log_directory():723] Logging user logs to /run/media/morpheuslord/Personal_Files/Projects/Rewriter/wandb/run-20260503_124131-7q4dwe22/logs/debug.log
+2026-05-03 12:41:31,520 INFO MainThread:34388 [wandb_init.py:setup_run_log_directory():724] Logging internal logs to /run/media/morpheuslord/Personal_Files/Projects/Rewriter/wandb/run-20260503_124131-7q4dwe22/logs/debug-internal.log
+2026-05-03 12:41:31,520 INFO MainThread:34388 [wandb_init.py:init():850] calling init triggers
+2026-05-03 12:41:31,520 INFO MainThread:34388 [wandb_init.py:init():855] wandb.init called with sweep_config: {}
+config: {'model': {'key': 'flan-t5-small', 'quantize': False, 'use_lora': True}, 'lora': {'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'target_modules': ['q', 'v', 'k', 'o', 'wi_0', 'wi_1', 'wo']}, 'data': {'train_path': 'data/processed/train.jsonl', 'val_path': 'data/processed/val.jsonl', 'test_path': 'data/processed/test.jsonl', 'max_input_length': 128, 'max_target_length': 128, 'augment_synthetic': True, 'synthetic_ratio': 0.3}, 'training': {'output_dir': 'checkpoints/', 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 8, 'learning_rate': 0.0003, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.05, 'weight_decay': 0.01, 'fp16': False, 'bf16': True, 'evaluation_strategy': 'steps', 'eval_steps': 100, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 3, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'logging_dir': 'logs/', 'logging_steps': 25, 'report_to': ['wandb', 'tensorboard'], 'dataloader_num_workers': 0, 'seed': 42, 'push_to_hub': False}, 'loss': {'lambda_style': 0.3, 'lambda_semantic': 0.5, 'lambda_human_pattern': 0.4, 'sem_model_name': 'all-mpnet-base-v2'}, 'generation': {'num_beams': 5, 'length_penalty': 1.0, 'no_repeat_ngram_size': 3, 'min_length': 10, 'max_new_tokens': 512, 'early_stopping': True}, 'human_pattern': {'classifier_path': 'checkpoints/human_pattern_classifier.pt', 'shanegerami_path': 'data/raw/shanegerami/AI_Human.csv', 'starblasters_path': 'data/raw/starblasters8/data.parquet', 'max_samples_per_source': 50000, 'pretrain_epochs': 20, 'pretrain_lr': 0.001, 'pretrain_batch_size': 512, 'target_auc': 0.88}, '_wandb': {}}
+2026-05-03 12:41:31,520 INFO MainThread:34388 [wandb_init.py:init():898] starting backend
+2026-05-03 12:41:31,902 INFO MainThread:34388 [wandb_init.py:init():913] sending inform_init request
+2026-05-03 12:41:32,641 INFO MainThread:34388 [wandb_init.py:init():918] backend started and connected
+2026-05-03 12:41:32,643 INFO MainThread:34388 [wandb_init.py:init():988] updated telemetry
+2026-05-03 12:41:32,644 INFO MainThread:34388 [wandb_init.py:init():1011] communicating run to backend with 90.0 second timeout
+2026-05-03 12:41:33,463 INFO MainThread:34388 [wandb_init.py:init():1056] starting run threads in backend
+2026-05-03 12:41:33,614 INFO MainThread:34388 [wandb_run.py:_console_start():2554] atexit reg
+2026-05-03 12:41:33,614 INFO MainThread:34388 [wandb_run.py:_redirect():2403] redirect: wrap_raw
+2026-05-03 12:41:33,614 INFO MainThread:34388 [wandb_run.py:_redirect():2472] Wrapping output streams.
+2026-05-03 12:41:33,614 INFO MainThread:34388 [wandb_run.py:_redirect():2495] Redirects installed.
+2026-05-03 12:41:33,616 INFO MainThread:34388 [wandb_init.py:init():1094] run started, returning control to user process
+2026-05-03 12:41:39,987 INFO MainThread:34388 [wandb_run.py:_config_callback():1415] config_cb None None {'peft_config': {'default': {'task_type': 'SEQ_2_SEQ_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.19.1', 'base_model_name_or_path': 'google/flan-t5-small', 'revision': None, 'inference_mode': False, 'r': 8, 'target_modules': ['wo', 'wi_0', 'v', 'q', 'k', 'o', 'wi_1'], 'exclude_modules': None, 'lora_alpha': 16, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'lora_ga_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'use_bdlora': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 32128, 'd_model': 512, 'd_kv': 64, 'd_ff': 1024, 'num_layers': 8, 'num_decoder_layers': 8, 'num_heads': 6, 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'dropout_rate': 0.1, 'classifier_dropout': 0.0, 'layer_norm_epsilon': 1e-06, 'initializer_factor': 1.0, 'feed_forward_proj': 'gated-gelu', 'use_cache': True, 'dense_act_fn': 'gelu_new', 'is_gated_act': True, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': True, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['T5ForConditionalGeneration'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 0, 'eos_token_id': 1, 'sep_token_id': None, 'decoder_start_token_id': 0, 'task_specific_params': {'summarization': {'early_stopping': True, 'length_penalty': 2.0, 'max_length': 200, 'min_length': 30, 'no_repeat_ngram_size': 3, 'num_beams': 4, 'prefix': 'summarize: '}, 'translation_en_to_de': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to German: '}, 'translation_en_to_fr': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to French: '}, 'translation_en_to_ro': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to Romanian: '}}, 'problem_type': None, '_name_or_path': 'google/flan-t5-small', 'transformers_version': '4.53.2', 'model_type': 't5', 'n_positions': 512, 'output_past': True, 'output_attentions': False, 'output_dir': 'checkpoints/', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0003, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.05, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'logs/', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 3, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'checkpoints/', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb', 'tensorboard'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
+2026-05-03 12:41:39,991 INFO MainThread:34388 [wandb_config.py:__setitem__():155] [no run ID] config set model/num_parameters = 78239104 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7f6f34fb9e80>>
+2026-05-03 12:41:39,992 INFO MainThread:34388 [wandb_run.py:_config_callback():1415] config_cb model/num_parameters 78239104 None
+2026-05-03 13:52:01,572 INFO wandb-AsyncioManager-main:34388 [service_client.py:_forward_responses():134] Reached EOF.
+2026-05-03 13:52:01,575 INFO wandb-AsyncioManager-main:34388 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles.
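The `config_cb` entry above dumps the merged model, PEFT, and TrainingArguments config that wandb captured, and the following entries record 78,239,104 total parameters. For orientation, here is a minimal sketch of the adapter setup those logged values imply: `google/flan-t5-small` wrapped with a rank-8 LoRA over `q/k/v/o/wi_0/wi_1/wo`. It assumes current `transformers` and `peft` releases; the project name is illustrative, and this is not the repository's actual training entry point, only a reconstruction from the logged config:

```python
import wandb
from transformers import AutoModelForSeq2SeqLM
from peft import LoraConfig, TaskType, get_peft_model

# Values copied from the logged peft_config: r=8, alpha=16, dropout=0.05.
lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q", "v", "k", "o", "wi_0", "wi_1", "wo"],
)

base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()  # the log above records 78,239,104 total params

# Passing the nested dict mirrors the 'config:' entry in debug.log.
run = wandb.init(project="rewriter-demo", config={
    "model": {"key": "flan-t5-small", "use_lora": True},
    "lora": {"r": 8, "lora_alpha": 16, "lora_dropout": 0.05},
})
```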
wandb/run-20260502_165105-pwnhqrrf/run-pwnhqrrf.wandb
ADDED
Binary file (11.5 kB)

wandb/run-20260502_165541-4d797dih/run-4d797dih.wandb
ADDED
Binary file (6.88 kB)

wandb/run-20260502_201947-ngpyijum/run-ngpyijum.wandb
ADDED
Binary file (30.8 kB)

wandb/run-20260502_202439-7n7pnref/run-7n7pnref.wandb
ADDED
Binary file (26.2 kB)

wandb/run-20260502_203519-fib23yhh/run-fib23yhh.wandb
ADDED
Binary file (32.8 kB)

wandb/run-20260502_210534-j0t4q38m/run-j0t4q38m.wandb
ADDED
Binary file (45.4 kB)

wandb/run-20260502_212127-vl8pftkj/run-vl8pftkj.wandb
ADDED
Binary file (65.8 kB)

wandb/run-20260502_213822-mmm9bdu9/run-mmm9bdu9.wandb
ADDED
Binary file (54.7 kB)

wandb/run-20260503_120130-xzkygl93/run-xzkygl93.wandb
ADDED
Binary file (4.92 kB)

wandb/run-20260503_120403-cbb6slr5/run-cbb6slr5.wandb
ADDED
Binary file (64.9 kB)