Add files using upload-large-folder tool
- .gitattributes +2 -32
- .gitignore +50 -0
- .train_stage1_done +0 -0
- Dockerfile +29 -0
- Plan.MD +0 -0
- README.md +469 -3
- configs/awl_config.yaml +13 -0
- configs/inference_config.yaml +32 -0
- configs/model_config.yaml +35 -0
- configs/training_config.yaml +70 -0
- configs/training_config_fast.yaml +82 -0
- docker-compose.yml +21 -0
- graph_codebase.py +859 -0
- graphify-out/GRAPH_REPORT.md +252 -0
- graphify-out/cost.json +36 -0
- graphify-out/graph.html +0 -0
- graphify-out/graph.json +0 -0
- graphify-out/manifest.json +444 -0
- pyproject.toml +42 -0
- requirements-dev.txt +9 -0
- requirements.txt +59 -0
- scripts/download_all_huggingface_datasets.py +61 -0
- scripts/download_datasets.sh +31 -0
- scripts/download_kaggle_datasets.sh +41 -0
- scripts/evaluate.py +85 -0
- scripts/preprocess_data.py +206 -0
- scripts/pretrain_human_pattern_classifier.py +201 -0
- scripts/run_inference.py +59 -0
- scripts/train.py +390 -0
- src/__init__.py +0 -0
- start.sh +123 -0
- tests/test_evaluation.py +46 -0
- tests/test_model.py +44 -0
- tests/test_preprocessing.py +82 -0
- tests/test_style.py +47 -0
- tests/test_vocabulary.py +38 -0
- todo_registry.md +335 -0
- train.sh +215 -0
- wandb/debug-internal.log +578 -0
- wandb/debug.log +24 -0
- wandb/run-20260502_165105-pwnhqrrf/run-pwnhqrrf.wandb +0 -0
- wandb/run-20260502_165541-4d797dih/run-4d797dih.wandb +0 -0
- wandb/run-20260502_201947-ngpyijum/run-ngpyijum.wandb +0 -0
- wandb/run-20260502_202439-7n7pnref/run-7n7pnref.wandb +0 -0
- wandb/run-20260502_203519-fib23yhh/run-fib23yhh.wandb +0 -0
- wandb/run-20260502_210534-j0t4q38m/run-j0t4q38m.wandb +0 -0
- wandb/run-20260502_212127-vl8pftkj/run-vl8pftkj.wandb +0 -0
- wandb/run-20260502_213822-mmm9bdu9/run-mmm9bdu9.wandb +0 -0
- wandb/run-20260503_120130-xzkygl93/run-xzkygl93.wandb +0 -0
- wandb/run-20260503_120403-cbb6slr5/run-cbb6slr5.wandb +0 -0
.gitattributes
CHANGED
@@ -1,35 +1,5 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+checkpoints/** filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,50 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.egg-info/
dist/
build/
*.egg

# Virtual environment
venv/
.venv/
env/

# IDE
.vscode/
.idea/
*.swp
*.swo

# Data (large files)
data/raw/
data/processed/
!data/awl/

# Model checkpoints
checkpoints/
*.pt
*.pth
*.bin
*.safetensors

# Logs
logs/
wandb/
*.log

# OS
.DS_Store
Thumbs.db

# Jupyter
.ipynb_checkpoints/

# Environment
.env
*.env

# Docker
.dockerignore
.train_stage1_done
ADDED
File without changes
Dockerfile
ADDED
@@ -0,0 +1,29 @@
FROM python:3.10-slim

# System dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    curl \
    default-jre \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download spaCy model
RUN python -m spacy download en_core_web_trf

# Download NLTK data
RUN python -c "import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet')"

# Copy application
COPY . .

# Expose API port
EXPOSE 8000

# Default: run the API server
CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
Plan.MD
ADDED
The diff for this file is too large to render. See raw diff.
README.md
CHANGED
@@ -1,3 +1,469 @@
----
-
----
---
language:
- en
tags:
- text2text-generation
- dyslexia
- grammar-correction
- style-preservation
- lora
- flan-t5
license: mit
base_model: google/flan-t5-small
datasets:
- cambridge/fce
- wi_locness
- jfleg
pipeline_tag: translation
---

# Dyslexia Academic Writing Correction System

> **A style-preserving, grammar-correcting, academic-vocabulary-elevating AI system that corrects dyslexic writing while maintaining the author's personal voice, tone, and authorship signal — not a rewriter, a corrector.**

## Overview

This system takes text written by dyslexic students and corrects grammar, spelling, and fluency errors while:

1. **Preserving the author's unique writing style** via a 512-dimensional style fingerprint vector
2. **Elevating vocabulary to academic register** using Coxhead's Academic Word List (AWL) and BERT-based lexical substitution
3. **Resisting AI detection** through a frozen Human Pattern Classifier that penalises AI-typical writing during training
4. **Maintaining semantic meaning** with cosine-similarity-based semantic preservation loss

The core model is **Google Flan-T5-Small** fine-tuned with **LoRA** (Low-Rank Adaptation), trained on real learner error corpora (FCE, W&I+LOCNESS, JFLEG) augmented with synthetic dyslexia-simulated data.

---

## Features

| Feature | Description |
|---------|-------------|
| **Two-pass spell correction** | Dyslexia-aware phonetic pattern handling via LanguageTool |
| **Style fingerprinting** | 41 raw features → MLP → 512-dim L2-normalised style vector |
| **LoRA fine-tuning** | 1.63% trainable params (1.28M / 78.2M total), rank=8 |
| **Academic vocabulary elevation** | BERT fill-mask → AWL candidate filtering → semantic similarity gate |
| **Human pattern anti-AI loss** | Pre-trained frozen MLP classifier (17-dim features including GPT-2 perplexity) |
| **Combined training loss** | `L_CE + λ₁·L_style + λ₂·L_semantic + λ₃·L_human_pattern` |
| **Sentence-chunked inference** | Long texts split into 128-token chunks matching training window |
| **FastAPI server** | RESTful `/correct` endpoint with CORS and rate limiting |
| **Multi-stage training** | Orchestrated via `train.sh` with checkpoint system (Skip/Redo/Continue) |
| **Synthetic data augmentation** | `DyslexiaSimulator` generates realistic errors from clean text |

---

## Project Structure

```
Rewriter/
├── configs/
│   ├── training_config.yaml        # Full training hyperparameters
│   ├── training_config_fast.yaml   # Quick iteration config
│   ├── inference_config.yaml       # Inference & generation settings
│   ├── model_config.yaml           # Model architecture registry
│   └── awl_config.yaml             # Academic Word List settings
├── scripts/
│   ├── train.py                    # Main training script (Click CLI)
│   ├── evaluate.py                 # Test set evaluation (GLEU, ERRANT, BERTScore)
│   ├── run_inference.py            # Interactive CLI inference
│   ├── preprocess_data.py          # Raw datasets → unified JSONL
│   ├── pretrain_human_pattern_classifier.py  # Stage 3: anti-AI classifier
│   ├── download_datasets.sh        # BEA-2019 dataset downloader
│   └── download_kaggle_datasets.sh # Kaggle human/AI data downloader
├── src/
│   ├── model/
│   │   ├── base_model.py           # Model loader (T5/BART/Llama + LoRA + quantization)
│   │   ├── style_conditioner.py    # Prefix tuning: style → virtual tokens
│   │   ├── generation_utils.py     # Beam search, sampling, batch generation
│   │   └── lora_adapter.py         # LoRA configuration helpers
│   ├── preprocessing/
│   │   ├── pipeline.py             # Full preprocessing orchestrator
│   │   ├── spell_corrector.py      # LanguageTool + dyslexia-aware correction
│   │   ├── dyslexia_simulator.py   # Synthetic error generation (Rello et al.)
│   │   ├── dependency_parser.py    # spaCy dependency tree analysis
│   │   ├── ner_tagger.py           # Named entity protection
│   │   └── sentence_segmenter.py   # Sentence boundary detection
│   ├── style/
│   │   ├── fingerprinter.py        # 41 features → 512-dim style vector
│   │   ├── style_vector.py         # Style vector dataclass
│   │   ├── formality_classifier.py # Rule-based formality scoring
│   │   └── emotion_classifier.py   # Emotion detection
│   ├── training/
│   │   ├── dataset.py              # Pre-tokenized cached dataset with style vectors
│   │   ├── trainer.py              # CorrectionTrainer (HF Trainer + PEFT fixes)
│   │   ├── loss_functions.py       # V1 and V2 combined losses
│   │   ├── human_pattern_extractor.py  # 17-dim feature extraction + classifier
│   │   └── callbacks.py            # Evaluation logging callbacks
│   ├── vocabulary/
│   │   ├── lexical_substitution.py # BERT fill-mask → AWL substitution pipeline
│   │   ├── awl_loader.py           # Coxhead Academic Word List loader
│   │   └── register_filter.py      # Contraction expansion + colloquial replacement
│   ├── inference/
│   │   ├── corrector.py            # End-to-end inference pipeline orchestrator
│   │   └── postprocessor.py        # Cleanup, entity restore, formatting
│   ├── evaluation/
│   │   ├── gleu_scorer.py          # GLEU + BERTScore computation
│   │   ├── errant_evaluator.py     # ERRANT P/R/F0.5 evaluation
│   │   ├── style_metrics.py        # Style similarity + AWL coverage
│   │   └── authorship_verifier.py  # AI detection resistance testing
│   └── api/
│       ├── main.py                 # FastAPI application
│       ├── schemas.py              # Pydantic request/response models
│       └── middleware.py           # Rate limiting + CORS
├── data/
│   ├── raw/                        # Original datasets (FCE, W&I+LOCNESS, JFLEG, Kaggle)
│   ├── processed/                  # Unified JSONL (train/val/test splits)
│   ├── cache/                      # Pre-tokenized dataset caches (.pt files)
│   └── awl/                        # Coxhead Academic Word List
├── train.sh                        # Multi-stage training orchestrator
├── start.sh                        # Inference launcher (CLI or API mode)
├── Dockerfile                      # Production container
├── docker-compose.yml              # Docker deployment
├── requirements.txt                # Python dependencies
└── pyproject.toml                  # Project metadata
```

---

## Design Choices & Rationale

### Why Flan-T5-Small?

| Consideration | Decision |
|---------------|----------|
| **Hardware constraint** | RTX 3050 Laptop GPU (4GB VRAM) — rules out models > 500M params |
| **Architecture** | Encoder-decoder (seq2seq) is ideal for text-to-text correction tasks |
| **Instruction tuning** | Flan-T5 is pre-trained on 1,800+ instruction tasks — follows correction prompts naturally |
| **LoRA efficiency** | Only 1.28M trainable params (1.63%) — fits in 4GB with batch_size=4 + bf16 |

### Why LoRA over Full Fine-Tuning?

- **Memory**: Full fine-tuning of T5-Small requires ~2.5GB for gradients alone; LoRA needs ~200MB
- **Speed**: LoRA converges in 5 epochs (~1,515 steps) on a single RTX 3050
- **Merging**: LoRA weights merge into base model at inference time — zero latency overhead
- **Configuration**: `r=8, alpha=16, dropout=0.05`, targeting all attention + FFN projections (`q, k, v, o, wi_0, wi_1, wo`)
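In PEFT terms, this configuration corresponds roughly to the following (a minimal sketch: the values mirror `configs/training_config.yaml`, while the surrounding loading code is illustrative rather than the project's `lora_adapter.py`):

```python
# Sketch: wrap Flan-T5-Small with the LoRA settings described above.
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSeq2SeqLM

base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,                    # LoRA rank
    lora_alpha=16,          # scaling factor (2×r)
    lora_dropout=0.05,
    target_modules=["q", "k", "v", "o", "wi_0", "wi_1", "wo"],
)
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()  # ≈1.3M trainable of ≈78M total
```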
### Why a Combined Multi-Objective Loss?

The system uses a 4-term loss function: `L = L_CE + 0.3·L_style + 0.5·L_semantic + 0.4·L_human`

| Term | Purpose | Weight |
|------|---------|--------|
| `L_CE` | Standard cross-entropy token prediction | 1.0 |
| `L_style` | `1 - cos_sim(output_style, input_style)` — preserves writing fingerprint | 0.3 |
| `L_semantic` | `1 - cos_sim(input_embedding, output_embedding)` — preserves meaning | 0.5 |
| `L_human` | `1 - HumanPatternClassifier(output)` — penalises AI-like text patterns | 0.4 |

**Why these weights?** Style and human-pattern losses are auxiliary signals — too high and they override grammar correction. The semantic loss is weighted highest (0.5) because meaning preservation is the hardest constraint to satisfy.
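A minimal sketch of how the four terms combine (the cross-entropy value, style vectors, sentence embeddings, and classifier score are assumed to come from the project's own modules in `src/`; only the weighting logic is shown):

```python
import torch.nn.functional as F

def combined_loss(ce_loss, in_style, out_style, in_emb, out_emb, human_score,
                  l_style=0.3, l_semantic=0.5, l_human=0.4):
    """L = L_CE + λ1·L_style + λ2·L_semantic + λ3·L_human (weights from the config)."""
    style_loss = 1 - F.cosine_similarity(out_style, in_style, dim=-1).mean()
    semantic_loss = 1 - F.cosine_similarity(in_emb, out_emb, dim=-1).mean()
    human_loss = 1 - human_score.mean()  # classifier output: 0 = AI-like, 1 = human-like
    return ce_loss + l_style * style_loss + l_semantic * semantic_loss + l_human * human_loss
```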
### Why a Human Pattern Classifier?

AI-generated text has detectable statistical signatures:
- **Lower GPT-2 perplexity** (AI text is more "predictable")
- **Lower burstiness** (AI has uniform sentence lengths; humans vary)
- **Higher AI marker density** (overuse of "delve", "leverage", "furthermore")
- **Lower n-gram novelty** (AI reuses phrases more)

The classifier is a 3-layer MLP (17→128→64→1) pre-trained on ~100k samples from two Kaggle datasets (Shanegerami AI_Human.csv + Starblasters8), then **frozen** during main training. Its output score (0=AI, 1=human) is used as a reward signal.
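The 17 → 128 → 64 → 1 architecture maps to a small PyTorch module along these lines (a sketch: the layer sizes come from the text above, the ReLU/sigmoid choices are assumptions):

```python
import torch.nn as nn

class HumanPatternClassifier(nn.Module):
    """3-layer MLP scoring a 17-dim feature vector: 0 = AI-like, 1 = human-like."""
    def __init__(self, n_features: int = 17):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, 128), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, 1), nn.Sigmoid(),
        )

    def forward(self, x):
        return self.net(x).squeeze(-1)
```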
### Why Sentence-Chunked Inference?

The model was trained with `max_input_length=128` tokens. The task prefix alone consumes ~40 tokens, leaving ~86 tokens for actual text. Long inputs are:

1. Split into sentences using spaCy
2. Grouped into chunks that fit the 128-token budget
3. Corrected chunk by chunk, independently
4. Joined back together

This prevents the model from seeing out-of-distribution input lengths and avoids truncation artifacts.
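Sketched in code, the chunking step looks roughly like this (greedy sentence packing under the token budget; the ~40-token prefix allowance comes from the text above, while the packing loop itself is illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")

def chunk_text(text, tokenizer, budget=128, prefix_tokens=40):
    """Greedily pack sentences into chunks that fit the model's input window."""
    limit = budget - prefix_tokens  # room left after the task prefix
    chunks, current, current_len = [], [], 0
    for sent in nlp(text).sents:
        n = len(tokenizer.encode(sent.text, add_special_tokens=False))
        if current and current_len + n > limit:
            chunks.append(" ".join(current))
            current, current_len = [], 0
        current.append(sent.text)
        current_len += n
    if current:
        chunks.append(" ".join(current))
    return chunks
```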
### Why Post-Generation Vocabulary Elevation?

Rather than relying solely on the model to produce academic vocabulary (which T5-Small lacks the capacity for), we apply a separate **BERT-based lexical substitution** pipeline:

1. POS-tag the output with spaCy
2. Identify non-AWL content words (nouns, verbs, adjectives, adverbs)
3. Mask each candidate → run BERT fill-mask → filter to AWL-only predictions
4. Accept substitution only if `semantic_similarity > 0.82` (measured with `all-mpnet-base-v2`)
5. Track used substitutions to prevent duplicate replacements
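A condensed sketch of steps 3-4 (the fill-mask and similarity models are the ones named above; the filtering loop is illustrative, not the project's exact `lexical_substitution.py`):

```python
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

fill_mask = pipeline("fill-mask", model="bert-large-uncased")
sem_model = SentenceTransformer("all-mpnet-base-v2")

def try_awl_substitution(sentence, word, awl_words, threshold=0.82):
    """Mask one content word, keep the best AWL prediction that preserves meaning."""
    masked = sentence.replace(word, fill_mask.tokenizer.mask_token, 1)
    for pred in fill_mask(masked, top_k=20):
        candidate = pred["token_str"].strip()
        if candidate.lower() not in awl_words or candidate.lower() == word.lower():
            continue
        new_sentence = sentence.replace(word, candidate, 1)
        sim = util.cos_sim(sem_model.encode(sentence),
                           sem_model.encode(new_sentence)).item()
        if sim > threshold:  # semantic similarity gate
            return new_sentence
    return sentence
```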
---

## Quick Start

### Prerequisites

- Python ≥ 3.10
- NVIDIA GPU with ≥ 4GB VRAM (or CPU, slower)
- ~10GB disk space for models and datasets

### Option A: Automated Training Pipeline

```bash
# Clone and setup
git clone https://huggingface.co/morpheuslord/rewriter && cd rewriter
pip install -r requirements.txt

# Set W&B key (optional, for experiment tracking)
export WANDB_API_KEY="your-key-here"

# Run the full 5-stage pipeline
bash train.sh
```

The orchestrator handles: **Setup → Preprocessing → Human Pattern Pre-training → Model Training → Evaluation**

Each stage has a checkpoint system — if interrupted, re-run `train.sh` and select `[S]kip` for completed stages.

### Option B: Manual Step-by-Step

```bash
# 1. Install dependencies
pip install -r requirements.txt
python -m spacy download en_core_web_sm

# 2. Preprocess datasets (FCE, W&I+LOCNESS, JFLEG → unified JSONL)
python scripts/preprocess_data.py

# 3. Pre-train the human pattern classifier
python scripts/pretrain_human_pattern_classifier.py

# 4. Train the correction model
PYTHONPATH=. python scripts/train.py --config configs/training_config.yaml --use-v2-loss

# 5. Merge LoRA adapter into base model for inference
python -c "
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-small', torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(model, 'checkpoints/checkpoint-BEST')
model = model.merge_and_unload()
model.save_pretrained('checkpoints/best_model_merged')
AutoTokenizer.from_pretrained('google/flan-t5-small').save_pretrained('checkpoints/best_model_merged')
"

# 6. Run inference
PYTHONPATH=. python scripts/run_inference.py --text "The studnet recieved alot of informtion."

# 7. Or start the API server
PYTHONPATH=. python -m uvicorn src.api.main:app --host 0.0.0.0 --port 8000
```

---

## Training Pipeline (5 Stages)

### Stage 1: Setup & Dependencies
Installs Python packages, downloads spaCy models (`en_core_web_sm`), and NLTK tokenizers.

### Stage 2: Data Preprocessing
Converts raw datasets into unified JSONL format:

| Dataset | Source | Format | Pairs |
|---------|--------|--------|-------|
| **FCE v2.1** | BEA-2019 Shared Task | Character-level edits | ~28k |
| **W&I+LOCNESS v2.1** | BEA-2019 Shared Task | Character-level edits | ~34k |
| **JFLEG** | Johns Hopkins | 4 reference corrections per source | ~5k |

Output schema: `{"input": "erroneous text", "target": "corrected text", "source": "fce|wi_locness|jfleg"}`

Split: 90% train / 10% validation (with 50% of validation used as test, capped at 500).
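Illustratively, the split logic amounts to something like this (a sketch; the actual implementation lives in `scripts/preprocess_data.py`):

```python
import random

def split_pairs(pairs, seed=42):
    """90/10 train/val split; half of val becomes test, capped at 500 examples."""
    random.Random(seed).shuffle(pairs)
    n_val = len(pairs) // 10
    train, val = pairs[n_val:], pairs[:n_val]
    n_test = min(len(val) // 2, 500)
    test, val = val[:n_test], val[n_test:]
    return train, val, test
```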
### Stage 3: Human Pattern Classifier Pre-Training
Trains a frozen binary MLP classifier on ~100k human vs AI text samples. Uses 17 features:

```
[perplexity, burstiness, sentence_starter_diversity,
 bigram_novelty, trigram_novelty, 4gram_novelty,
 ai_marker_density, overused_discourse_density,
 em_dash_rate, ellipsis_rate, comma_rate, semicolon_rate,
 word_count, sentence_count, mean_sent_length, std_sent_length, ttr]
```

GPT-2 perplexity is computed in batched GPU forward passes. Text features are extracted in parallel via `ProcessPoolExecutor`.
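Two of the less standard features, sketched (these are common definitions of burstiness and n-gram novelty and may differ in detail from the project's extractor):

```python
import statistics

def burstiness(sentence_lengths):
    """Variation in sentence length; human text tends to score higher than AI text."""
    if len(sentence_lengths) < 2:
        return 0.0
    return statistics.stdev(sentence_lengths) / (statistics.mean(sentence_lengths) + 1e-9)

def ngram_novelty(tokens, n=3):
    """Fraction of distinct n-grams; lower values mean more phrase reuse."""
    grams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    return len(set(grams)) / max(len(grams), 1)
```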
### Stage 4: Main Model Training
Fine-tunes Flan-T5-Small with LoRA using the V2 combined loss. Key hyperparameters:

| Parameter | Value |
|-----------|-------|
| Effective batch size | 32 (4 × 8 gradient accumulation) |
| Learning rate | 3e-4 (cosine schedule, 5% warmup) |
| Precision | bf16 (Ampere+ GPUs) |
| Max input tokens | 128 |
| Max target tokens | 128 |
| Epochs | 5 |
| Eval/Save interval | Every 100 steps |

### Stage 5: Evaluation
Runs on test set with metrics: GLEU, BERTScore F1, ERRANT F0.5, Style Similarity, AWL Coverage.

---

## Inference Pipeline (7 Steps)

```
Raw Text
   │
   ▼
1. Preprocessing ─────── LanguageTool spell correction + spaCy parsing
   │
   ▼
2. Style Fingerprinting ─ Extract 41 features → MLP → 512-dim vector
   │
   ▼
3. Sentence-Chunked Generation ─ Split into 128-token chunks → Flan-T5 → rejoin
   │
   ▼
4. Post-Processing ───── Remove artifacts, replace em dashes, fix spacing
   │
   ▼
5. Vocabulary Elevation ─ BERT fill-mask → AWL filtering → semantic gate
   │
   ▼
6. Register Filtering ── Expand contractions, replace colloquialisms
   │
   ▼
7. Metrics ──────────── Style similarity, AWL coverage, readability scores
   │
   ▼
Corrected Text
```

---

## Configuration Reference

### `configs/training_config.yaml`

```yaml
model:
  key: "flan-t5-small"        # flan-t5-xl | flan-t5-large | flan-t5-base | flan-t5-small
  quantize: false             # 4-bit NF4 quantization (needs GPU)
  use_lora: true              # Parameter-efficient fine-tuning

lora:
  r: 8                        # LoRA rank (higher = more capacity, more VRAM)
  lora_alpha: 16              # Scaling factor (usually 2×r)
  lora_dropout: 0.05          # Regularisation
  target_modules: [q, v, k, o, wi_0, wi_1, wo]  # All attention + FFN layers

training:
  per_device_train_batch_size: 4
  gradient_accumulation_steps: 8   # Effective batch = 32
  learning_rate: 3.0e-4
  lr_scheduler_type: cosine
  bf16: true                  # Use bfloat16 on Ampere+ GPUs

loss:
  lambda_style: 0.3           # Style preservation weight
  lambda_semantic: 0.5        # Meaning preservation weight
  lambda_human_pattern: 0.4   # Anti-AI penalty weight
```

### `configs/inference_config.yaml`

```yaml
model:
  key: "flan-t5-small"
  checkpoint_path: "checkpoints/best_model_merged"
  use_lora: false             # Merged model — no adapter needed

generation:
  num_beams: 5                # Beam search width
  length_penalty: 1.2         # > 1.0 rewards longer outputs
  no_repeat_ngram_size: 3     # Prevents repetition
  max_new_tokens: 128         # Must match training max_target_length

vocabulary:
  semantic_threshold: 0.82    # Minimum cosine similarity for AWL substitution
```
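Both files are plain YAML, so any section can be read directly with PyYAML, e.g.:

```python
import yaml

with open("configs/training_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["lora"]["r"])                  # 8
print(cfg["training"]["learning_rate"])  # 0.0003
print(cfg["loss"]["lambda_semantic"])    # 0.5
```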
## API Usage

```bash
# Start the server
PYTHONPATH=. python -m uvicorn src.api.main:app --host 0.0.0.0 --port 8000

# Correct text
curl -X POST http://localhost:8000/correct \
  -H "Content-Type: application/json" \
  -d '{"text": "The studnet recieved alot of informtion.", "style_alpha": 0.6}'

# Health check
curl http://localhost:8000/health
```

Interactive docs available at `http://localhost:8000/docs`.
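The same call from Python (the response fields are defined by the Pydantic models in `src/api/schemas.py`, so this example just prints the raw JSON):

```python
import requests

resp = requests.post(
    "http://localhost:8000/correct",
    json={"text": "The studnet recieved alot of informtion.", "style_alpha": 0.6},
    timeout=60,
)
resp.raise_for_status()
print(resp.json())
```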
---

## Hardware Requirements

| Tier | GPU | Model | Training Time |
|------|-----|-------|---------------|
| **Tested** | RTX 3050 4GB | Flan-T5-Small + LoRA | ~45 min (5 epochs) |
| Recommended | RTX 3090 24GB | Flan-T5-Base + LoRA | ~2h |
| Maximum | A100 80GB | Flan-T5-XL + LoRA | ~12h |

CPU inference is supported but significantly slower (~30s per correction vs ~2s on GPU).

---

## Data Sources

| Dataset | Type | Size | Source |
|---------|------|------|--------|
| FCE v2.1 | Learner errors + corrections | ~28k pairs | Cambridge English |
| W&I+LOCNESS v2.1 | Learner errors + corrections | ~34k pairs | BEA-2019 Shared Task |
| JFLEG | Fluency corrections (4 refs) | ~5k pairs | Johns Hopkins |
| Shanegerami AI_Human.csv | Human vs AI classification | ~50k samples | Kaggle |
| Starblasters8 data.parquet | Human vs AI classification | ~50k samples | Kaggle |
| Coxhead AWL | Academic Word List | 570 families / 549 headwords | Victoria University |

---

## Dyslexia Error Simulation

The `DyslexiaSimulator` generates synthetic training data based on research by Rello et al. (2013, 2017):

| Error Type | Frequency | Example |
|-----------|-----------|---------|
| Phonetic substitution | 35% | "because" → "becaus" |
| Letter transposition | 18% | "the" → "teh" |
| Letter omission | 16% | "important" → "importnt" |
| Letter doubling | 12% | "letter" → "lettter" |
| Letter reversal (b/d, p/q) | 10% | "bad" → "dad" |
| Word boundary errors | 9% | "a lot" → "alot" |
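A toy version of two of these error injectors (illustrative only; the real `DyslexiaSimulator` draws on phonetic confusion tables from Rello et al. and the frequencies in the table above):

```python
import random

def transpose(word, rng):   # "the" -> "teh"
    if len(word) < 3:
        return word
    i = rng.randrange(len(word) - 1)
    return word[:i] + word[i + 1] + word[i] + word[i + 2:]

def omit(word, rng):        # "important" -> "importnt"
    if len(word) < 4:
        return word
    i = rng.randrange(1, len(word) - 1)
    return word[:i] + word[i + 1:]

def simulate(text, p=0.15, seed=0):
    """Apply a random error to roughly a fraction p of words."""
    rng = random.Random(seed)
    out = []
    for w in text.split():
        out.append(rng.choice([transpose, omit])(w, rng) if rng.random() < p else w)
    return " ".join(out)
```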
---

## Style Fingerprint Vector

The 512-dimensional style vector captures 41 raw features:

| Group | Features | Count |
|-------|----------|-------|
| Sentence stats | mean, std, skew of sentence lengths | 3 |
| Word stats | mean, std of word lengths | 2 |
| Lexical | type-token ratio, lexical density | 2 |
| Syntactic | passive/active voice ratio, subordinate clause ratio, avg dependency tree depth | 4 |
| Discourse | 20 academic discourse markers (per 100 words) | 20 |
| Register | hedging frequency, formality score, nominalization ratio | 3 |
| Readability | Flesch reading ease, avg syllables per word | 2 |
| Pronouns | first-person ratio, third-person ratio | 2 |
| Other | question ratio, exclamation ratio, AWL coverage | 3 |

These are projected through a 2-layer MLP (`41 → 256 → 512`) with LayerNorm and GELU activation, then L2-normalised.
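In code, that projection is roughly (a sketch matching the dimensions, LayerNorm/GELU, and L2 normalisation named above; not the project's exact `fingerprinter.py`):

```python
import torch.nn as nn
import torch.nn.functional as F

class StyleProjector(nn.Module):
    """Project 41 raw style features to a 512-dim L2-normalised fingerprint."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(41, 256), nn.LayerNorm(256), nn.GELU(),
            nn.Linear(256, 512),
        )

    def forward(self, features):
        return F.normalize(self.net(features), p=2, dim=-1)
```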
---

## Known Limitations

1. **Model capacity**: Flan-T5-Small (77M params) has limited correction ability compared to larger models
2. **Training window**: 128-token max input means very long sentences may be split mid-clause
3. **Vocabulary elevation**: BERT fill-mask can suggest semantically inappropriate AWL words; the similarity threshold (0.82) is a trade-off between coverage and accuracy
4. **Already-correct text**: The model is trained on error→correction pairs; feeding it clean text produces unpredictable output
5. **LanguageTool latency**: Spell correction takes ~15-20s due to JVM startup on first call
configs/awl_config.yaml
ADDED
@@ -0,0 +1,13 @@
awl:
  primary: "data/awl/coxhead_awl.txt"
  supplementary:
    - "data/awl/domain_lexicons/humanities.txt"
    - "data/awl/domain_lexicons/sciences.txt"
    - "data/awl/domain_lexicons/social_sciences.txt"
  academic_synonyms: "data/awl/academic_synonyms.json"

register:
  expand_contractions: true
  replace_colloquialisms: true
  enforce_third_person_academic: false  # Keep user's voice (don't force "one")
  minimum_formality_score: 0.65
configs/inference_config.yaml
ADDED
@@ -0,0 +1,32 @@
model:
  key: "flan-t5-small"
  checkpoint_path: "checkpoints/best_model_merged"
  quantize: false
  use_lora: false  # Merged model — no adapter needed
  model_hidden_dim: 512

style_conditioner:
  style_dim: 512
  n_prefix_tokens: 10

generation:
  num_beams: 5
  length_penalty: 1.2
  no_repeat_ngram_size: 3
  min_length: 5
  max_new_tokens: 128
  early_stopping: true
  temperature: 0.7
  do_sample: false

vocabulary:
  awl_path: "data/awl/coxhead_awl.txt"
  mlm_model: "bert-large-uncased"
  sem_model: "all-mpnet-base-v2"
  semantic_threshold: 0.82

api:
  host: "0.0.0.0"
  port: 8000
  workers: 1
  reload: false
configs/model_config.yaml
ADDED
@@ -0,0 +1,35 @@
model:
  key: "flan-t5-xl"
  checkpoint_path: "checkpoints/best_model"
  quantize: false
  use_lora: true
  model_hidden_dim: 2048  # flan-t5-xl hidden size
  # model_hidden_dim: 1024  # flan-t5-large
  # model_hidden_dim: 1024  # bart-large
  # model_hidden_dim: 4096  # llama-3.1-8b

style_conditioner:
  style_dim: 512
  n_prefix_tokens: 10

fingerprinter:
  spacy_model: "en_core_web_trf"
  awl_path: "data/awl/coxhead_awl.txt"
  projection_hidden_dim: 256
  projection_output_dim: 512

generation:
  num_beams: 5
  length_penalty: 1.0
  no_repeat_ngram_size: 3
  min_length: 10
  max_new_tokens: 512
  early_stopping: true
  temperature: 0.7  # Slight randomness for naturalness
  do_sample: false  # Beam search by default

vocabulary:
  awl_path: "data/awl/coxhead_awl.txt"
  mlm_model: "bert-large-uncased"
  sem_model: "all-mpnet-base-v2"
  semantic_threshold: 0.82
configs/training_config.yaml
ADDED
@@ -0,0 +1,70 @@
model:
  key: "flan-t5-small"  # flan-t5-xl | flan-t5-large | flan-t5-base | flan-t5-small | bart-large | llama-3.1-8b
  quantize: false
  use_lora: true

lora:
  r: 8
  lora_alpha: 16
  lora_dropout: 0.05
  target_modules: ["q", "v", "k", "o", "wi_0", "wi_1", "wo"]

data:
  train_path: "data/processed/train.jsonl"
  val_path: "data/processed/val.jsonl"
  test_path: "data/processed/test.jsonl"
  max_input_length: 128
  max_target_length: 128
  augment_synthetic: true
  synthetic_ratio: 0.3

training:
  output_dir: "checkpoints/"
  num_train_epochs: 5
  per_device_train_batch_size: 4  # T5-Small in bf16 fits batch=4 in 4GB VRAM
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 8  # Effective batch = 4*8 = 32
  learning_rate: 3.0e-4
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.05
  weight_decay: 0.01
  fp16: false
  bf16: true  # Use bfloat16 on Ampere+ GPUs
  evaluation_strategy: "steps"
  eval_steps: 100
  save_strategy: "steps"
  save_steps: 100
  save_total_limit: 3
  load_best_model_at_end: true
  metric_for_best_model: "eval_loss"
  greater_is_better: false
  logging_dir: "logs/"
  logging_steps: 25
  report_to: ["wandb", "tensorboard"]
  dataloader_num_workers: 0  # Python 3.14 forkserver breaks with workers > 0
  seed: 42
  push_to_hub: false

loss:
  lambda_style: 0.3
  lambda_semantic: 0.5
  lambda_human_pattern: 0.4  # Human pattern reward weight
  sem_model_name: "all-mpnet-base-v2"

generation:
  num_beams: 5
  length_penalty: 1.0
  no_repeat_ngram_size: 3
  min_length: 10
  max_new_tokens: 512
  early_stopping: true

human_pattern:
  classifier_path: "checkpoints/human_pattern_classifier.pt"
  shanegerami_path: "data/raw/shanegerami/AI_Human.csv"
  starblasters_path: "data/raw/starblasters8/data.parquet"
  max_samples_per_source: 50000
  pretrain_epochs: 20
  pretrain_lr: 1.0e-3
  pretrain_batch_size: 512
  target_auc: 0.88
configs/training_config_fast.yaml
ADDED
@@ -0,0 +1,82 @@
# ═══════════════════════════════════════════════════════════════════════════
# training_config_fast.yaml — Optimised for RTX 3050 (4GB) + T5-Small
# ═══════════════════════════════════════════════════════════════════════════
# Usage: python scripts/train.py --config configs/training_config_fast.yaml
#
# Compared to training_config.yaml, this config:
#   - Uses T5-Small (77M params vs 248M for Base)
#   - Lower LoRA rank for fewer trainable params
#   - Larger batch + less accumulation for throughput
#   - More epochs since each is fast
#   - More frequent logging/eval for tighter feedback loop

model:
  key: "flan-t5-small"
  quantize: false
  use_lora: true

lora:
  r: 8
  lora_alpha: 16
  lora_dropout: 0.05
  target_modules: ["q", "v", "k", "o", "wi_0", "wi_1", "wo"]

data:
  train_path: "data/processed/train.jsonl"
  val_path: "data/processed/val.jsonl"
  test_path: "data/processed/test.jsonl"
  max_input_length: 128
  max_target_length: 128
  augment_synthetic: true
  synthetic_ratio: 0.3

training:
  output_dir: "checkpoints/"
  num_train_epochs: 5
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 8  # Effective batch = 32
  learning_rate: 3.0e-4
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.05
  weight_decay: 0.01
  fp16: false
  bf16: true
  evaluation_strategy: "steps"
  eval_steps: 100
  save_strategy: "steps"
  save_steps: 100
  save_total_limit: 3
  load_best_model_at_end: true
  metric_for_best_model: "eval_loss"
  greater_is_better: false
  logging_dir: "logs/"
  logging_steps: 25
  report_to: ["tensorboard"]  # Skip W&B for max speed
  dataloader_num_workers: 0  # Python 3.14 forkserver breaks with workers > 0
  seed: 42
  push_to_hub: false

loss:
  lambda_style: 0.3
  lambda_semantic: 0.5
  lambda_human_pattern: 0.4
  sem_model_name: "all-mpnet-base-v2"

generation:
  num_beams: 5
  length_penalty: 1.0
  no_repeat_ngram_size: 3
  min_length: 10
  max_new_tokens: 512
  early_stopping: true

human_pattern:
  classifier_path: "checkpoints/human_pattern_classifier.pt"
  shanegerami_path: "data/raw/shanegerami/AI_Human.csv"
  starblasters_path: "data/raw/starblasters8/data.parquet"
  max_samples_per_source: 50000
  pretrain_epochs: 20
  pretrain_lr: 1.0e-3
  pretrain_batch_size: 512
  target_auc: 0.88
docker-compose.yml
ADDED
@@ -0,0 +1,21 @@
version: "3.8"

services:
  api:
    build: .
    ports:
      - "8000:8000"
    volumes:
      - ./configs:/app/configs
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
    environment:
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
graph_codebase.py
ADDED
@@ -0,0 +1,859 @@
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
graphify_rebuild.py — One-shot NudR knowledge graph regeneration.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python graphify_rebuild.py # Full rebuild
|
| 7 |
+
python graphify_rebuild.py --watch # Watch mode (rebuilds on file change)
|
| 8 |
+
python graphify_rebuild.py --quick # Skip semantic, AST-only rebuild
|
| 9 |
+
|
| 10 |
+
Outputs (all in graphify-out/):
|
| 11 |
+
GRAPH_REPORT.md — Full community/audit report
|
| 12 |
+
graph.html — Interactive force-directed graph (open in browser)
|
| 13 |
+
graph.json — Raw graph data for tooling
|
| 14 |
+
manifest.json — File hashes for incremental re-runs
|
| 15 |
+
cost.json — Token usage tracking
|
| 16 |
+
"""
|
| 17 |
+
import sys, io, os, json, ast, hashlib, time, argparse
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from datetime import datetime, timezone
|
| 20 |
+
|
| 21 |
+
# Fix Windows console encoding
|
| 22 |
+
if sys.platform == 'win32':
|
| 23 |
+
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
|
| 24 |
+
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
|
| 25 |
+
|
| 26 |
+
# ─── Configuration ───────────────────────────────────────────────────────────
|
| 27 |
+
ROOT = Path(__file__).parent
|
| 28 |
+
OUT_DIR = ROOT / 'graphify-out'
|
| 29 |
+
CACHE_DIR = OUT_DIR / 'cache'
|
| 30 |
+
MANIFEST = OUT_DIR / 'manifest.json'
|
| 31 |
+
REPORT_PATH = OUT_DIR / 'GRAPH_REPORT.md'
|
| 32 |
+
HTML_PATH = OUT_DIR / 'graph.html'
|
| 33 |
+
JSON_PATH = OUT_DIR / 'graph.json'
|
| 34 |
+
COST_PATH = OUT_DIR / 'cost.json'
|
| 35 |
+
|
| 36 |
+
# Directories and patterns to skip
|
| 37 |
+
SKIP_DIRS = {
|
| 38 |
+
'.git', '.venv', 'venv', 'node_modules', '__pycache__', '.mypy_cache',
|
| 39 |
+
'.pytest_cache', '.graphify', 'graphify-out', '.terraform', '.idea',
|
| 40 |
+
'env', 'dist', 'build', 'egg-info', '.tox', '.ruff_cache',
|
| 41 |
+
}
|
| 42 |
+
SKIP_EXTENSIONS = {'.pyc', '.pyo', '.whl', '.egg', '.so', '.dll', '.exe'}
|
| 43 |
+
|
| 44 |
+
# File types for AST extraction
|
| 45 |
+
AST_EXTENSIONS = {'.py'}
|
| 46 |
+
|
| 47 |
+
# File types for corpus (semantic awareness)
|
| 48 |
+
CORPUS_EXTENSIONS = {
|
| 49 |
+
'.py', '.md', '.txt', '.html', '.css', '.js', '.ts', '.json',
|
| 50 |
+
'.yaml', '.yml', '.toml', '.cfg', '.ini', '.proto', '.tf', '.tfvars',
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ─── Step 1: Detect files ────────────────────────────────────────────────────
|
| 55 |
+
def detect_files():
|
| 56 |
+
"""Walk the project and return list of relevant files with metadata."""
|
| 57 |
+
files = []
|
| 58 |
+
total_words = 0
|
| 59 |
+
for dirpath, dirnames, filenames in os.walk(ROOT):
|
| 60 |
+
# Prune skipped directories
|
| 61 |
+
dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]
|
| 62 |
+
for fname in filenames:
|
| 63 |
+
fpath = Path(dirpath) / fname
|
| 64 |
+
ext = fpath.suffix.lower()
|
| 65 |
+
if ext in SKIP_EXTENSIONS:
|
| 66 |
+
continue
|
| 67 |
+
rel = fpath.relative_to(ROOT)
|
| 68 |
+
if any(part.startswith('.') for part in rel.parts[:-1]):
|
| 69 |
+
continue
|
| 70 |
+
try:
|
| 71 |
+
mtime = fpath.stat().st_mtime
|
| 72 |
+
size = fpath.stat().st_size
|
| 73 |
+
except OSError:
|
| 74 |
+
continue
|
| 75 |
+
if ext in CORPUS_EXTENSIONS and size < 5_000_000:
|
| 76 |
+
try:
|
| 77 |
+
content = fpath.read_text(encoding='utf-8', errors='ignore')
|
| 78 |
+
word_count = len(content.split())
|
| 79 |
+
total_words += word_count
|
| 80 |
+
except Exception:
|
| 81 |
+
word_count = 0
|
| 82 |
+
else:
|
| 83 |
+
word_count = 0
|
| 84 |
+
files.append({
|
| 85 |
+
'path': str(rel),
|
| 86 |
+
'ext': ext,
|
| 87 |
+
'mtime': mtime,
|
| 88 |
+
'size': size,
|
| 89 |
+
'words': word_count,
|
| 90 |
+
})
|
| 91 |
+
return files, total_words
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def get_changed_files(files):
|
| 95 |
+
"""Compare against manifest to find changed files."""
|
| 96 |
+
if MANIFEST.exists():
|
| 97 |
+
old_manifest = json.loads(MANIFEST.read_text(encoding='utf-8'))
|
| 98 |
+
else:
|
| 99 |
+
old_manifest = {}
|
| 100 |
+
changed = []
|
| 101 |
+
for f in files:
|
| 102 |
+
old_mtime = old_manifest.get(f['path'])
|
| 103 |
+
if old_mtime is None or f['mtime'] != old_mtime:
|
| 104 |
+
changed.append(f)
|
| 105 |
+
return changed
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# ─── Step 2: AST Extraction ──────────────────────────────────────────────────
|
| 109 |
+
def hash_file(path):
|
| 110 |
+
"""SHA-256 hash for cache keying."""
|
| 111 |
+
h = hashlib.sha256()
|
| 112 |
+
try:
|
| 113 |
+
h.update(Path(path).read_bytes())
|
| 114 |
+
except Exception:
|
| 115 |
+
h.update(path.encode())
|
| 116 |
+
return h.hexdigest()
|
| 117 |
+


def extract_ast_file(filepath):
    """Extract AST nodes and edges from a single Python file."""
    nodes = []
    edges = []
    rel = str(filepath.relative_to(ROOT))
    file_id = rel.replace('\\', '_').replace('/', '_').replace('.', '_')

    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source, filename=str(filepath))
    except SyntaxError:
        return nodes, edges

    # File-level node
    nodes.append({
        'id': file_id,
        'label': filepath.name,
        'file_type': 'code',
        'source_file': rel,
    })

    # Extract module-level docstring
    docstring = ast.get_docstring(tree)
    if docstring and len(docstring) > 20:
        doc_id = f"{file_id}_docstring"
        nodes.append({
            'id': doc_id,
            'label': docstring[:80],
            'file_type': 'rationale',
            'source_file': rel,
        })
        edges.append({
            'source': file_id, 'target': doc_id,
            'relation': 'has_rationale',
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': rel, 'weight': 0.5,
        })

    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            func_id = f"{file_id}_{node.name}"
            label = f"{node.name}()"
            nodes.append({
                'id': func_id,
                'label': label,
                'file_type': 'code',
                'source_file': rel,
                'source_location': f"line {node.lineno}",
            })
            edges.append({
                'source': file_id, 'target': func_id,
                'relation': 'defines',
                'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                'source_file': rel, 'weight': 1.0,
            })

            # Function docstring
            fdoc = ast.get_docstring(node)
            if fdoc and len(fdoc) > 20:
                fdoc_id = f"{func_id}_doc"
                nodes.append({
                    'id': fdoc_id,
                    'label': fdoc[:80],
                    'file_type': 'rationale',
                    'source_file': rel,
                    'source_location': f"line {node.lineno}",
                })
                edges.append({
                    'source': func_id, 'target': fdoc_id,
                    'relation': 'has_rationale',
                    'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                    'source_file': rel, 'weight': 0.5,
                })

            # Calls inside functions
            for child in ast.walk(node):
                if isinstance(child, ast.Call):
                    callee = _get_call_name(child)
                    if callee:
                        edges.append({
                            'source': func_id,
                            'target': callee,
                            'relation': 'calls',
                            'confidence': 'INFERRED', 'confidence_score': 0.7,
                            'source_file': rel, 'weight': 0.8,
                        })

        elif isinstance(node, ast.ClassDef):
            class_id = f"{file_id}_{node.name}"
            nodes.append({
                'id': class_id,
                'label': node.name,
                'file_type': 'code',
                'source_file': rel,
                'source_location': f"line {node.lineno}",
            })
            edges.append({
                'source': file_id, 'target': class_id,
                'relation': 'defines',
                'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                'source_file': rel, 'weight': 1.0,
            })

            # Class docstring
            cdoc = ast.get_docstring(node)
            if cdoc and len(cdoc) > 20:
                cdoc_id = f"{class_id}_doc"
                nodes.append({
                    'id': cdoc_id,
                    'label': cdoc[:80],
                    'file_type': 'rationale',
                    'source_file': rel,
                    'source_location': f"line {node.lineno}",
                })
                edges.append({
                    'source': class_id, 'target': cdoc_id,
                    'relation': 'has_rationale',
                    'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                    'source_file': rel, 'weight': 0.5,
                })

            # Base classes
            for base in node.bases:
                base_name = _get_name(base)
                if base_name:
                    edges.append({
                        'source': class_id, 'target': base_name,
                        'relation': 'inherits',
                        'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                        'source_file': rel, 'weight': 1.0,
                    })

        elif isinstance(node, ast.Import):
            for alias in node.names:
                edges.append({
                    'source': file_id, 'target': alias.name,
                    'relation': 'imports',
                    'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                    'source_file': rel, 'weight': 0.6,
                })

        elif isinstance(node, ast.ImportFrom) and node.module:
            edges.append({
                'source': file_id, 'target': node.module,
                'relation': 'imports',
                'confidence': 'EXTRACTED', 'confidence_score': 1.0,
                'source_file': rel, 'weight': 0.6,
            })

    return nodes, edges
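# Emitted shape for a hypothetical one-function module `util.py` containing
# `def load(): ...` (values illustrative, following the code above):
#   nodes -> [{'id': 'util_py', 'label': 'util.py', 'file_type': 'code', ...},
#             {'id': 'util_py_load', 'label': 'load()', 'file_type': 'code', ...}]
#   edges -> [{'source': 'util_py', 'target': 'util_py_load',
#              'relation': 'defines', 'confidence': 'EXTRACTED', ...}]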


def _get_call_name(node):
    """Extract callable name from ast.Call node."""
    if isinstance(node.func, ast.Name):
        return node.func.id
    elif isinstance(node.func, ast.Attribute):
        return node.func.attr
    return None


def _get_name(node):
    """Extract name from various AST node types."""
    if isinstance(node, ast.Name):
        return node.id
    elif isinstance(node, ast.Attribute):
        return node.attr
    return None
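# Note (illustrative): attribute calls resolve to their final attribute, so
# `session.get_cached_image(...)` yields 'get_cached_image' with the receiver
# discarded. That ambiguity is one reason downstream 'calls' edges are only
# INFERRED rather than EXTRACTED.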


def _resolve_edges(all_nodes, all_edges):
    """Post-process edges to resolve bare names to actual node IDs.

    The per-file AST extraction produces edges with bare targets:
    - calls: target='get_cached_image' (bare function name)
    - imports: target='app.core.session' (dotted module path)

    This function resolves them to actual node IDs so they survive
    the graph build phase (which drops unresolvable targets).
    """
    node_ids = {n['id'] for n in all_nodes}

    # Build function name → [node_id, ...] index
    func_index: dict[str, list[str]] = {}
    for n in all_nodes:
        if n.get('file_type') == 'code' and '(' in n.get('label', ''):
            # label looks like "get_cached_image()"
            bare_name = n['label'].rstrip('()')
            func_index.setdefault(bare_name, []).append(n['id'])

    # Build module path → file node ID map
    # e.g. 'app.core.session' → 'app_core_session_py'
    module_index: dict[str, str] = {}
    for n in all_nodes:
        src = n.get('source_file', '')
        if src.endswith('.py'):
            # Convert 'app/core/session.py' or 'app\core\session.py'
            # → dotted module: 'app.core.session'
            mod_path = src.replace('\\', '/').replace('/', '.').removesuffix('.py')
            # Strip a trailing '.__init__' so package imports resolve
            mod_path_init = mod_path.removesuffix('.__init__')
            nid = n['id']
            # Only map file-level nodes (no functions/classes)
            if nid == src.replace('\\', '_').replace('/', '_').replace('.', '_'):
                module_index[mod_path] = nid
                if mod_path != mod_path_init:
                    module_index[mod_path_init] = nid

    resolved_edges = []
    calls_resolved = 0
    imports_resolved = 0
    dropped = 0

    for edge in all_edges:
        rel = edge.get('relation', '')

        if rel == 'calls':
            target = edge['target']
            # Try exact match first
            if target in node_ids:
                resolved_edges.append(edge)
                calls_resolved += 1
                continue
            # Resolve via function index
            matches = func_index.get(target, [])
            if matches:
                for match_id in matches:
                    # Don't create self-edges within the same file
                    if match_id.rsplit('_', 1)[0] != edge['source'].rsplit('_', 1)[0] or len(matches) == 1:
                        resolved_edges.append({
                            **edge,
                            'target': match_id,
                            'confidence': 'INFERRED' if len(matches) > 1 else 'EXTRACTED',
                            'confidence_score': 0.9 if len(matches) == 1 else 0.6,
                        })
                        calls_resolved += 1
            else:
                dropped += 1

        elif rel == 'imports':
            target = edge['target']
            # Try exact match as node ID first
            if target in node_ids:
                resolved_edges.append(edge)
                imports_resolved += 1
                continue
            # Resolve dotted module path to file node ID
            resolved_id = module_index.get(target)
            if resolved_id:
                resolved_edges.append({**edge, 'target': resolved_id})
                imports_resolved += 1
                continue
            # Try progressively shorter prefixes
            # e.g. 'app.core.session.revoke_all' → 'app.core.session' → 'app.core' → 'app'
            parts = target.split('.')
            found = False
            for i in range(len(parts) - 1, 0, -1):
                prefix = '.'.join(parts[:i])
                resolved_id = module_index.get(prefix)
                if resolved_id:
                    resolved_edges.append({**edge, 'target': resolved_id})
                    imports_resolved += 1
                    found = True
                    break
            if not found:
                # External/stdlib import — drop it
                dropped += 1

        else:
            # defines, has_rationale, etc — keep as-is
            resolved_edges.append(edge)

    print(f" Resolved: {calls_resolved} calls, {imports_resolved} imports, {dropped} dropped (external/stdlib)")
    return resolved_edges
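# Resolution walk-through (hypothetical edge, following the logic above):
#   {'source': 'app_api_images_py_fetch', 'target': 'get_cached_image', 'relation': 'calls'}
# with func_index['get_cached_image'] == ['app_core_cache_py_get_cached_image']
# keeps the single unambiguous match as EXTRACTED with score 0.9; two or more
# candidate definitions would fan out into INFERRED edges with score 0.6.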


def run_ast_extraction(files, use_cache=True):
    """Run AST extraction on all Python files, with caching."""
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    all_nodes = []
    all_edges = []
    cached, extracted = 0, 0

    # Collect valid cache hashes for cleanup
    valid_hashes = set()
    py_files = [f for f in files if f['ext'] in AST_EXTENSIONS]
    for f in py_files:
        fpath = ROOT / f['path']
        fhash = hash_file(fpath)
        valid_hashes.add(fhash)
        cache_file = CACHE_DIR / f"{fhash}.json"

        if use_cache and cache_file.exists():
            data = json.loads(cache_file.read_text(encoding='utf-8'))
            all_nodes.extend(data.get('nodes', []))
            all_edges.extend(data.get('edges', []))
            cached += 1
        else:
            nodes, edges = extract_ast_file(fpath)
            all_nodes.extend(nodes)
            all_edges.extend(edges)
            # Write cache
            cache_file.write_text(json.dumps({
                'nodes': nodes, 'edges': edges,
            }, indent=2), encoding='utf-8')
            extracted += 1

    # Clean stale cache entries
    stale = 0
    for cache_file in CACHE_DIR.glob('*.json'):
        h = cache_file.stem
        if h not in valid_hashes:
            cache_file.unlink()
            stale += 1

    print(f" AST: {len(py_files)} Python files ({cached} cached, {extracted} extracted)")
    if stale:
        print(f" Cache cleanup: {stale} stale entries removed")
    print(f" AST: {len(all_nodes)} nodes, {len(all_edges)} edges (raw)")

    # Resolve bare targets to actual node IDs
    all_edges = _resolve_edges(all_nodes, all_edges)
    print(f" AST: {len(all_nodes)} nodes, {len(all_edges)} edges (resolved)")
    return all_nodes, all_edges
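# Cache layout (from the code above): one JSON blob per source file at
# CACHE_DIR / f"{hash_file(path)}.json" holding {"nodes": [...], "edges": [...]};
# entries whose hash no longer matches any live file are unlinked on each run,
# so the cache directory never accumulates garbage from deleted files.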


# ─── Step 3: Semantic Extraction ─────────────────────────────────────────────
def build_semantic_nodes():
    """
    Build semantic nodes from documentation files.
    These capture high-level architecture concepts that AST can't see.
    """
    nodes = []
    edges = []
    hyperedges = []

    # Architecture components from README
    arch_nodes = [
        ("nudr_api", "NudR Stateless API", "README.md"),
        ("fastapi_backend", "FastAPI Stateless Backend", "README.md"),
        ("supabase_db", "Supabase PostgreSQL Database", "README.md"),
        ("redis_cache", "Redis Session & Cache Store", "README.md"),
        ("cloudflare_proxy", "Cloudflare Edge Proxy", "README.md"),
        ("stripe_payments", "Stripe Payment Integration", "README.md"),
        ("firebase_fcm", "Firebase FCM Push Notifications", "README.md"),
        ("e2ee_encryption", "E2EE X25519 Key Exchange", "README.md"),
        ("protobuf_framing", "Protobuf Binary WebSocket Framing", "README.md"),
        ("hmac_verification", "HMAC-SHA256 Request Verification", "README.md"),
        ("origin_secret", "X-Origin-Secret Middleware", "README.md"),
        ("pow_challenge", "Proof-of-Work Challenge", "README.md"),
        ("rate_limiting", "Per-IP Rate Limiting", "README.md"),
        ("aws_secrets", "AWS Secrets Manager Integration", "README.md"),
        ("terraform_infra", "Terraform AWS Infrastructure", "README.md"),
        ("vpc_network", "VPC Network Topology", "README.md"),
        ("alb_autoscaling", "ALB + Auto Scaling Group", "README.md"),
        ("lambda_rotator", "Lambda Origin Secret Rotator", "README.md"),
        ("unified_ws", "Unified WebSocket Endpoint /ws", "README.md"),
        ("feed_ws", "Feed WebSocket Channel", "README.md"),
        ("chat_ws", "Chat WebSocket Channel", "README.md"),
        ("keysync_ws", "Keysync WebSocket Channel", "README.md"),
        ("discovery_ws", "Discovery WebSocket Channel", "README.md"),
        ("attack_detection", "Attack Detection & IP Risk Management", "README.md"),
    ]

    for nid, label, src in arch_nodes:
        nodes.append({
            'id': f"sem_{nid}", 'label': label,
            'file_type': 'document', 'source_file': src,
        })

    # Architecture edges
    arch_edges = [
        ("nudr_api", "fastapi_backend", "implements"),
        ("fastapi_backend", "supabase_db", "references"),
        ("fastapi_backend", "redis_cache", "references"),
        ("cloudflare_proxy", "origin_secret", "references"),
        ("origin_secret", "lambda_rotator", "references"),
        ("stripe_payments", "fastapi_backend", "references"),
        ("firebase_fcm", "fastapi_backend", "references"),
        ("e2ee_encryption", "keysync_ws", "references"),
        ("protobuf_framing", "unified_ws", "references"),
        ("terraform_infra", "vpc_network", "references"),
        ("terraform_infra", "alb_autoscaling", "references"),
        ("terraform_infra", "aws_secrets", "references"),
        ("attack_detection", "rate_limiting", "references"),
        ("unified_ws", "feed_ws", "conceptually_related_to"),
        ("unified_ws", "chat_ws", "conceptually_related_to"),
        ("unified_ws", "keysync_ws", "conceptually_related_to"),
        ("unified_ws", "discovery_ws", "conceptually_related_to"),
    ]

    for src, tgt, rel in arch_edges:
        edges.append({
            'source': f"sem_{src}", 'target': f"sem_{tgt}",
            'relation': rel,
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': 'README.md', 'weight': 1.0,
        })

    # Feed system nodes (from feed_system_documentation.md)
    feed_nodes = [
        ("feed_system", "Feed System Technical Documentation", "PLAN/feed_system_documentation.md"),
        ("feed_scoring", "Multi-Factor Scoring Algorithm", "PLAN/feed_system_documentation.md"),
        ("feed_pool", "Feed Pool Computation Pipeline", "PLAN/feed_system_documentation.md"),
        ("feed_filters", "Feed Hard Filters (12 Rules)", "PLAN/feed_system_documentation.md"),
        ("feed_heatmap", "Preference Heatmap (Learned AI)", "PLAN/feed_system_documentation.md"),
        ("feed_reciprocal", "Reciprocal Boost & Injection", "PLAN/feed_system_documentation.md"),
        ("feed_gradient", "3-Tier Gradient Distribution", "PLAN/feed_system_documentation.md"),
        ("feed_redis", "Feed Redis Key Schema", "PLAN/feed_system_documentation.md"),
    ]

    for nid, label, src in feed_nodes:
        nodes.append({
            'id': f"sem_{nid}", 'label': label,
            'file_type': 'document', 'source_file': src,
        })

    feed_edges = [
        ("feed_system", "nudr_api", "references"),
        ("feed_pool", "redis_cache", "references"),
        ("feed_pool", "supabase_db", "references"),
        ("feed_scoring", "feed_pool", "references"),
        ("feed_filters", "feed_pool", "references"),
        ("feed_heatmap", "feed_scoring", "references"),
        ("feed_reciprocal", "feed_scoring", "references"),
        ("feed_gradient", "feed_scoring", "references"),
        ("feed_redis", "redis_cache", "references"),
        ("feed_system", "feed_ws", "references"),
    ]

    for src, tgt, rel in feed_edges:
        edges.append({
            'source': f"sem_{src}", 'target': f"sem_{tgt}",
            'relation': rel,
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': 'PLAN/feed_system_documentation.md', 'weight': 1.0,
        })

    # Logic analysis nodes
    logic_nodes = [
        ("logic_analysis", "Logic-Level Async Issue Audit", "PLAN/LOGIC_ANALYSIS.md"),
        ("id_ws_reuse", "DISASTROUS: id(ws) Memory Reuse Bug", "PLAN/LOGIC_ANALYSIS.md"),
        ("token_refresh_crash", "DISASTROUS: Token Refresh Crash Window", "PLAN/LOGIC_ANALYSIS.md"),
        ("pubsub_crash", "DISASTROUS: PubSub Listener Permanent Crash", "PLAN/LOGIC_ANALYSIS.md"),
        ("redis_pool_exhaustion", "DISASTROUS: Redis Connection Pool Exhaustion", "PLAN/LOGIC_ANALYSIS.md"),
        ("preference_race", "Race Condition: Preference Merge", "PLAN/LOGIC_ANALYSIS.md"),
    ]

    for nid, label, src in logic_nodes:
        nodes.append({
            'id': f"sem_{nid}", 'label': label,
            'file_type': 'document', 'source_file': src,
        })

    logic_edges = [
        ("id_ws_reuse", "unified_ws", "references"),
        ("token_refresh_crash", "unified_ws", "references"),
        ("pubsub_crash", "redis_cache", "references"),
        ("redis_pool_exhaustion", "redis_cache", "references"),
        ("preference_race", "supabase_db", "references"),
        ("logic_analysis", "nudr_api", "references"),
    ]

    for src, tgt, rel in logic_edges:
        edges.append({
            'source': f"sem_{src}", 'target': f"sem_{tgt}",
            'relation': rel,
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': 'PLAN/LOGIC_ANALYSIS.md', 'weight': 1.0,
        })

    # Hyperedges
    hyperedges = [
        {
            'id': 'websocket_channels',
            'label': 'WebSocket Channel System',
            'nodes': ['sem_unified_ws', 'sem_feed_ws', 'sem_chat_ws', 'sem_keysync_ws', 'sem_discovery_ws'],
            'relation': 'participate_in',
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': 'README.md',
        },
        {
            'id': 'security_stack',
            'label': 'Security Defense Stack',
            'nodes': ['sem_hmac_verification', 'sem_origin_secret', 'sem_pow_challenge', 'sem_rate_limiting', 'sem_attack_detection'],
            'relation': 'participate_in',
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': 'README.md',
        },
        {
            'id': 'feed_pipeline',
            'label': 'Feed Recommendation Pipeline',
            'nodes': ['sem_feed_pool', 'sem_feed_filters', 'sem_feed_scoring', 'sem_feed_heatmap', 'sem_feed_reciprocal', 'sem_feed_gradient'],
            'relation': 'form',
            'confidence': 'EXTRACTED', 'confidence_score': 1.0,
            'source_file': 'PLAN/feed_system_documentation.md',
        },
    ]

    print(f" Semantic: {len(nodes)} nodes, {len(edges)} edges, {len(hyperedges)} hyperedges")
    return nodes, edges, hyperedges
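# Hyperedge consumption sketch (hypothetical helper, not part of this
# pipeline): expanding a hyperedge into pairwise links is one simple way to
# feed it to a plain pairwise-graph library.
#   import itertools
#   for h in hyperedges:
#       for a, b in itertools.combinations(h['nodes'], 2):
#           G.add_edge(a, b, relation=h['relation'])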


# ─── Step 4: Merge & Build Graph ─────────────────────────────────────────────
def merge_and_build(ast_nodes, ast_edges, sem_nodes, sem_edges, hyperedges):
    """Merge AST + semantic, build NetworkX graph, cluster, analyze."""
    from graphify.build import build_from_json
    from graphify.cluster import cluster, score_all
    from graphify.analyze import god_nodes, surprising_connections, suggest_questions

    # Merge: AST first, deduplicate semantic by id
    seen = {n['id'] for n in ast_nodes}
    merged_nodes = list(ast_nodes)
    for n in sem_nodes:
        if n['id'] not in seen:
            merged_nodes.append(n)
            seen.add(n['id'])

    merged_edges = ast_edges + sem_edges

    extraction = {
        'nodes': merged_nodes,
        'edges': merged_edges,
        'hyperedges': hyperedges,
    }

    G = build_from_json(extraction)
    communities = cluster(G)
    cohesion = score_all(G, communities)
    gods = god_nodes(G)
    surprises = surprising_connections(G, communities)

    # Auto-label communities
    labels = {}
    for cid, members in communities.items():
        names = " ".join(members[:10]).lower()
        if 'feed' in names and 'service' in names:
            labels[cid] = "Feed System"
        elif 'feed' in names and ('score' in names or 'pool' in names):
            labels[cid] = "Feed Scoring & Pool"
        elif 'chat' in names and ('ws' in names or 'websocket' in names):
            labels[cid] = "Chat WebSocket"
        elif 'keysync' in names or 'key_exchange' in names:
            labels[cid] = "Key Exchange & Sync"
        elif 'discovery' in names and ('match' in names or 'like' in names):
            labels[cid] = "Discovery & Matching"
        elif 'auth' in names or 'signup' in names or 'signin' in names:
            labels[cid] = "Authentication"
        elif 'payment' in names or 'stripe' in names:
            labels[cid] = "Payments & Billing"
        elif 'setting' in names or 'profile' in names or 'preference' in names:
            labels[cid] = "Settings & Profiles"
        elif 'consent' in names:
            labels[cid] = "Consent System"
        elif 'report' in names or 'violation' in names:
            labels[cid] = "Reporting & Moderation"
        elif 'notification' in names or 'fcm' in names:
            labels[cid] = "Push Notifications"
        elif 'redis' in names or 'cache' in names:
            labels[cid] = "Redis & Caching"
        elif 'supabase' in names or 'migration' in names:
            labels[cid] = "Database Layer"
        elif 'terraform' in names or 'aws' in names or 'vpc' in names:
            labels[cid] = "Infrastructure (Terraform)"
        elif 'security' in names or 'rate_limit' in names or 'attack' in names:
            labels[cid] = "Security & Rate Limiting"
        elif 'codec' in names or 'hmac' in names or 'protobuf' in names:
            labels[cid] = "WebSocket Codec"
        elif 'unified' in names and 'ws' in names:
            labels[cid] = "Unified WebSocket"
        elif 'token' in names:
            labels[cid] = "Token Management"
        elif 'image' in names:
            labels[cid] = "Image Processing"
        elif 'event' in names or 'pending' in names:
            labels[cid] = "Event Queue"
        elif 'linkup' in names:
            labels[cid] = "Linkup System"
        elif 'test' in names:
            labels[cid] = "Tests"
        elif 'nuke' in names or 'script' in names:
            labels[cid] = "Utility Scripts"
        elif 'email' in names or 'otp' in names:
            labels[cid] = "Email & OTP"
        elif 'flutter' in names:
            labels[cid] = "Flutter Directives"
        elif 'readme' in names:
            labels[cid] = "API Documentation"
        else:
            labels[cid] = f"Module Group {cid}"

    questions = suggest_questions(G, communities, labels)

    print(f" Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges, {len(communities)} communities")
    return G, communities, cohesion, labels, gods, surprises, questions, extraction
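# Labeling caveat (illustrative): the chain above is first-match-wins over the
# lowercased names of a community's first ten members, e.g.
#   names = "feed_service feed_pool stripe_checkout"
# matches the first branch and yields "Feed System" even though payment code
# is present; anything unmatched falls back to f"Module Group {cid}".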


# ─── Step 5: Generate Outputs ────────────────────────────────────────────────
def generate_outputs(G, communities, cohesion, labels, gods, surprises, questions, detection, extraction):
    """Generate report, HTML, JSON, and manifest."""
    from graphify.report import generate
    from graphify.export import to_json, to_html

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    tokens = {'input': 0, 'output': 0}

    # Report
    report = generate(
        G, communities, cohesion, labels, gods, surprises,
        detection, tokens, str(ROOT), suggested_questions=questions,
    )
    REPORT_PATH.write_text(report, encoding='utf-8')
    print(f" -> {REPORT_PATH.relative_to(ROOT)}")

    # JSON
    to_json(G, communities, str(JSON_PATH))
    print(f" -> {JSON_PATH.relative_to(ROOT)}")

    # HTML
    if G.number_of_nodes() <= 5000:
        to_html(G, communities, str(HTML_PATH), community_labels=labels)
        print(f" -> {HTML_PATH.relative_to(ROOT)}")
    else:
        print(f" !! Graph too large for HTML ({G.number_of_nodes()} nodes)")

    # Manifest
    manifest = {}
    for f in detection.get('files', []):
        manifest[f['path']] = f.get('mtime', 0)
    MANIFEST.write_text(json.dumps(manifest, indent=2), encoding='utf-8')

    # Cost tracker
    if COST_PATH.exists():
        cost = json.loads(COST_PATH.read_text(encoding='utf-8'))
    else:
        cost = {'runs': [], 'total_input_tokens': 0, 'total_output_tokens': 0}
    cost['runs'].append({
        'date': datetime.now(timezone.utc).isoformat(),
        'nodes': G.number_of_nodes(),
        'edges': G.number_of_edges(),
        'communities': len(communities),
    })
    COST_PATH.write_text(json.dumps(cost, indent=2), encoding='utf-8')
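# cost.json grows by one entry per run (schema from the code above):
#   {"runs": [{"date": "...", "nodes": 549, "edges": 873, "communities": 28}],
#    "total_input_tokens": 0, "total_output_tokens": 0}
# The token counters stay at zero because this rebuild is purely static;
# the tokens dict above is hard-coded and no LLM calls are made here.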


# ─── Main Pipeline ───────────────────────────────────────────────────────────
def run_pipeline(skip_semantic=False):
    """Execute the full graphify pipeline."""
    start = time.time()
    print("=" * 60)
    print(f"graphify rebuild — {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)

    # Step 1: Detect
    print("\n[1/5] Detecting files...")
    files, total_words = detect_files()
    changed = get_changed_files(files)
    print(f" Found {len(files)} files ({total_words:,} words)")
    print(f" Changed since last build: {len(changed)}")

    detection = {
        'files': files,
        'total_files': len(files),
        'total_words': total_words,
        'changed_files': len(changed),
    }

    # Step 2: AST extraction
    print("\n[2/5] AST extraction...")
    ast_nodes, ast_edges = run_ast_extraction(files)

    # Step 3: Semantic extraction
    if skip_semantic:
        print("\n[3/5] Semantic extraction... SKIPPED (--quick)")
        sem_nodes, sem_edges, hyperedges = [], [], []
    else:
        print("\n[3/5] Semantic extraction...")
        sem_nodes, sem_edges, hyperedges = build_semantic_nodes()

    # Step 4: Merge & build
    print("\n[4/5] Building graph...")
    G, communities, cohesion, labels, gods, surprises, questions, extraction = \
        merge_and_build(ast_nodes, ast_edges, sem_nodes, sem_edges, hyperedges)

    # Step 5: Generate outputs
    print("\n[5/5] Generating outputs...")
    generate_outputs(G, communities, cohesion, labels, gods, surprises, questions, detection, extraction)

    elapsed = time.time() - start
    print(f"\n{'=' * 60}")
    print(f"Done in {elapsed:.1f}s")
    print(f" {G.number_of_nodes()} nodes, {G.number_of_edges()} edges, {len(communities)} communities")
    print(" Open graphify-out/graph.html in your browser")
    print(f"{'=' * 60}")


def watch_mode(skip_semantic=False):
    """Watch for file changes and rebuild automatically."""
    print("Watching for changes... (Ctrl+C to stop)")
    last_mtimes = {}

    while True:
        try:
            changed = False
            for dirpath, dirnames, filenames in os.walk(ROOT):
                dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]
                for fname in filenames:
                    fpath = Path(dirpath) / fname
                    if fpath.suffix.lower() not in CORPUS_EXTENSIONS:
                        continue
                    try:
                        mtime = fpath.stat().st_mtime
                    except OSError:
                        continue
                    key = str(fpath)
                    if key in last_mtimes and last_mtimes[key] != mtime:
                        rel = fpath.relative_to(ROOT)
                        print(f"\n Changed: {rel}")
                        changed = True
                    last_mtimes[key] = mtime

            if changed:
                # Propagate --quick so watch rebuilds match the initial run.
                run_pipeline(skip_semantic=skip_semantic)

            time.sleep(3)
        except KeyboardInterrupt:
            print("\nStopped watching.")
            break


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='NudR Knowledge Graph Rebuild')
    parser.add_argument('--watch', action='store_true', help='Watch mode: rebuild on file change')
    parser.add_argument('--quick', action='store_true', help='Quick mode: AST-only, skip semantic')
    args = parser.parse_args()

    if args.watch:
        run_pipeline(skip_semantic=args.quick)
        watch_mode(skip_semantic=args.quick)
    else:
        run_pipeline(skip_semantic=args.quick)
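# CLI usage (from the argparse setup above):
#   python graph_codebase.py            # one full rebuild
#   python graph_codebase.py --quick    # AST-only, skip the semantic layer
#   python graph_codebase.py --watch    # rebuild whenever a corpus file changes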
graphify-out/GRAPH_REPORT.md
ADDED
@@ -0,0 +1,252 @@
# Graph Report - /run/media/morpheuslord/Personal_Files/Projects/Rewriter (2026-05-03)

## Corpus Check
- 442 files · ~1,967,332 words
- Verdict: corpus is large enough that graph structure adds value.

## Summary
- 549 nodes · 873 edges · 27 communities detected
- Extraction: 76% EXTRACTED · 24% INFERRED · 0% AMBIGUOUS · INFERRED: 208 edges (avg confidence: 0.6)
- Token cost: 0 input · 0 output

## Community Hubs (Navigation)
- [[_COMMUNITY_Module Group 0|Module Group 0]]
- [[_COMMUNITY_Utility Scripts|Utility Scripts]]
- [[_COMMUNITY_Module Group 2|Module Group 2]]
- [[_COMMUNITY_Module Group 3|Module Group 3]]
- [[_COMMUNITY_Utility Scripts|Utility Scripts]]
- [[_COMMUNITY_Module Group 5|Module Group 5]]
- [[_COMMUNITY_Token Management|Token Management]]
- [[_COMMUNITY_Utility Scripts|Utility Scripts]]
- [[_COMMUNITY_Authentication|Authentication]]
- [[_COMMUNITY_Utility Scripts|Utility Scripts]]
- [[_COMMUNITY_Module Group 10|Module Group 10]]
- [[_COMMUNITY_Feed Scoring & Pool|Feed Scoring & Pool]]
- [[_COMMUNITY_Module Group 12|Module Group 12]]
- [[_COMMUNITY_Token Management|Token Management]]
- [[_COMMUNITY_Module Group 14|Module Group 14]]
- [[_COMMUNITY_Utility Scripts|Utility Scripts]]
- [[_COMMUNITY_Module Group 16|Module Group 16]]
- [[_COMMUNITY_Module Group 17|Module Group 17]]
- [[_COMMUNITY_Module Group 18|Module Group 18]]
- [[_COMMUNITY_Module Group 19|Module Group 19]]
- [[_COMMUNITY_Module Group 20|Module Group 20]]
- [[_COMMUNITY_Infrastructure (Terraform)|Infrastructure (Terraform)]]
- [[_COMMUNITY_Utility Scripts|Utility Scripts]]
- [[_COMMUNITY_Module Group 23|Module Group 23]]
- [[_COMMUNITY_Security & Rate Limiting|Security & Rate Limiting]]
- [[_COMMUNITY_WebSocket Codec|WebSocket Codec]]
- [[_COMMUNITY_Module Group 27|Module Group 27]]

## God Nodes (most connected - your core abstractions)
1. `train()` - 34 edges
2. `__init__()` - 28 edges
3. `__init__()` - 27 edges
4. `__init__()` - 27 edges
5. `__init__()` - 27 edges
6. `__init__()` - 27 edges
7. `__init__()` - 27 edges
8. `__init__()` - 27 edges
9. `correct()` - 16 edges
10. `__init__()` - 13 edges

## Surprising Connections (you probably didn't know these)
- `run_inference()` --calls--> `correct()` [INFERRED]
  scripts/run_inference.py → src/preprocessing/spell_corrector.py
- `train()` --calls--> `__init__()` [INFERRED]
  scripts/train.py → src/training/dataset.py
- `__init__()` --calls--> `__init__()` [INFERRED]
  scripts/train.py → src/training/dataset.py
- `score()` --calls--> `forward()` [INFERRED]
  src/training/human_pattern_extractor.py → scripts/train.py
- `test_spell_correction_empty()` --calls--> `correct()` [INFERRED]
  tests/test_preprocessing.py → src/inference/corrector.py

## Hyperedges (group relationships)
- **WebSocket Channel System** — sem_unified_ws, sem_feed_ws, sem_chat_ws, sem_keysync_ws, sem_discovery_ws [EXTRACTED 1.00]
- **Security Defense Stack** — sem_hmac_verification, sem_origin_secret, sem_pow_challenge, sem_rate_limiting, sem_attack_detection [EXTRACTED 1.00]
- **Feed Recommendation Pipeline** — sem_feed_pool, sem_feed_filters, sem_feed_scoring, sem_feed_heatmap, sem_feed_reciprocal, sem_feed_gradient [EXTRACTED 1.00]

## Communities

### Community 0 - "Module Group 0"
Cohesion: 0.04
Nodes (55): EntitySpan, NERTagger, Tags named entities and produces protected spans., Named Entity Recognition tagger.
Identifies entities (persons, locations, organi, get_protected_spans(), Return (start, end) char spans that must not be modified., tag(), Extract all named entities from text. (+47 more)

### Community 1 - "Utility Scripts"
Cohesion: 0.06
Nodes (38): Evaluation script.
Runs all evaluation metrics on the test set.
Run: python scri, evaluate(), Run evaluation on the specified data split., ERRANTEvaluator, Evaluates grammar correction quality using ERRANT annotations., ERRANT-based grammatical error evaluation.
Uses the ERRANT toolkit for standardi, evaluate(), Compute ERRANT precision, recall, F0.5. (+30 more)

### Community 2 - "Module Group 2"
Cohesion: 0.07
Nodes (36): StyleFingerprinter, Extracts style fingerprint vectors from text samples., StyleProjectionMLP, Projects raw feature vector to 512-dim style embedding., _avg_dep_tree_depth(), Compute average dependency tree depth across all tokens., _avg_syllables_per_word(), Average syllables per word. (+28 more)

### Community 3 - "Module Group 3"
Cohesion: 0.06
Nodes (35): AWLLoader, Loads and manages Academic Word List data., _load_synonyms(), Load academic synonym mappings from JSON., _load_word_list(), Load a word list file into a set of lowercase words., all_words(), Return the full set of academic words. (+27 more)

### Community 4 - "Utility Scripts"
Cohesion: 0.31
Nodes (34): __init__(), CEOnlyLoss, Cross-entropy only loss — the only loss that provides gradient signal., __init__(), _auto_batch_size(), Pick optimal batch size based on model size and available resources., _setup_device(), Detect GPU and configure hybrid VRAM management.

Returns (device, gpu_info) whe (+26 more)

### Community 5 - "Module Group 5"
Cohesion: 0.08
Nodes (29): DyslexiaSimulator, Generates synthetic dyslectic text from clean input for data augmentation., _double_letter(), Double a random interior letter., _omit_letter(), Remove a random interior letter., _reverse_letter(), Swap b/d, p/q style reversals. (+21 more)

### Community 6 - "Token Management"
Cohesion: 0.07
Nodes (28): Loads and wraps the base pretrained model.
Supported architectures:
- google/f, load_model_and_tokenizer(), Load a pretrained model with optional LoRA and quantization.

Args:
model_ke, apply_lora(), Apply LoRA adapters to a model and return the wrapped model., create_lora_config(), Create a LoRA configuration for the given task type., LoRA adapter configuration and management.
Wraps PEFT LoRA utilities for applyin (+20 more)

### Community 7 - "Utility Scripts"
Cohesion: 0.08
Nodes (28): Pre-trains the HumanPatternClassifier on both Kaggle datasets.
Run this BEFORE t, train_classifier(), Pre-train the human pattern classifier on Kaggle datasets., forward(), HumanPatternClassifier, Lightweight MLP trained to distinguish human from AI writing.
Input: feature vec, HumanPatternFeatureExtractor, Extracts 17-dimensional feature vector encoding human vs AI writing patterns.

O (+20 more)

### Community 8 - "Authentication"
Cohesion: 0.08
Nodes (27): AuthorshipVerifier, Verifies authorship consistency between input and output text., Authorship verification module.
Uses a fine-tuned model to verify whether the co, verify(), Return probability that both texts were written by the same author.

Uses senten, average_style_vectors(), Compute the mean style vector from a list of vectors., cosine_similarity() (+19 more)

### Community 9 - "Utility Scripts"
Cohesion: 0.08
Nodes (25): Interactive inference script.
Run: python scripts/run_inference.py --config conf, run_inference(), Run inference on text input., correct_text(), Correct dyslectic text with style preservation and academic elevation., FastAPI server for the Dyslexia Academic Writing Corrector API.
Provides RESTful, health(), Health check endpoint. (+17 more)

### Community 10 - "Module Group 10"
Cohesion: 0.1
Nodes (27): _get_call_name(), Extract callable name from ast.Call node., _get_name(), Extract name from various AST node types., _resolve_edges(), Post-process edges to resolve bare names to actual node IDs.

The per-file AST e, build_semantic_nodes(), Build semantic nodes from documentation files.
These capture high-level architec (+19 more)

### Community 11 - "Feed Scoring & Pool"
Cohesion: 0.08
Nodes (27): Chat WebSocket Channel, Discovery WebSocket Channel, E2EE X25519 Key Exchange, FastAPI Stateless Backend, Feed Hard Filters (12 Rules), 3-Tier Gradient Distribution, Preference Heatmap (Learned AI), Feed Pool Computation Pipeline (+19 more)

### Community 12 - "Module Group 12"
Cohesion: 0.12
Nodes (22): GLEU, (Note: This script computes sentence-level GLEU score.)

This script calculates , get_gleu_stats(), calculate mean and confidence interval from all GLEU iterations, get_ngram_counts(), get ngrams of order n for a tokenized sentence, get_ngram_diff(), returns ngrams in a but not in b (+14 more)

### Community 13 - "Token Management"
Cohesion: 0.16
Nodes (17): clean_para(), convert_char_to_tok(), get_all_tok_starts_and_ends(), get_paras(), get_sents(), get_token_edits(), main(), noop_edit() (+9 more)

### Community 14 - "Module Group 14"
Cohesion: 0.13
Nodes (14): FormalityClassifier, Scores text formality on a 0-1 scale using rule-based heuristics., Formality classifier module.
Classifies text on a 0-1 formality scale using ling, score(), Return formality score in [0, 1]. Higher = more formal.

Scoring based on:
- Con, RegisterFilterAdvanced, Advanced register filtering with nominalisation and hedging passes., add_hedging() (+6 more)

### Community 15 - "Utility Scripts"
Cohesion: 0.2
Nodes (14): apply_bea19_edits(), Apply BEA-2019 character-level edits to produce corrected text.

edits_block for, create_splits(), Split train.jsonl into train and val sets., Converts all raw dataset formats into unified JSONL training format.
Output sche, main(), process_bea19_json(), Process a BEA-2019 format JSON file (FCE or W&I+LOCNESS).
Each line is a JSON ob (+6 more)

### Community 16 - "Module Group 16"
Cohesion: 0.24
Nodes (9): CorrectionTrainer, Custom trainer — uses model's built-in loss directly., _strip_custom_fields(), Remove dataset fields that T5 doesn't accept., compute_loss(), Use model's built-in CE loss — avoids double-computing logits loss., Custom HuggingFace Trainer subclass.
Uses the model's built-in cross-entropy los, prediction_step() (+1 more)

### Community 17 - "Module Group 17"
Cohesion: 0.29
Nodes (5): RateLimitMiddleware, Simple in-memory rate limiting., RequestLoggingMiddleware, Logs all incoming requests with timing information., API middleware for request logging, rate limiting, and error handling.

### Community 18 - "Module Group 18"
Cohesion: 0.29
Nodes (5): EarlyStoppingOnStyleDrift, Stops training if style similarity drops below threshold., StyleMetricsCallback, Logs style similarity metrics during evaluation., Training callbacks for monitoring and checkpointing.
Integrates with Weights & B

### Community 19 - "Module Group 19"
Cohesion: 0.33
Nodes (5): EmotionClassifier, Classifies emotional register of text using keyword-based analysis., classify(), Return emotion distribution over register categories.

Returns a dict with keys:, Emotion/register classifier module.
Classifies text emotional register (neutral,

### Community 20 - "Module Group 20"
Cohesion: 0.5
Nodes (3): CorrectionRequest, CorrectionResponse, Pydantic schemas for API request/response validation.

### Community 21 - "Infrastructure (Terraform)"
Cohesion: 0.5
Nodes (4): ALB + Auto Scaling Group, AWS Secrets Manager Integration, Terraform AWS Infrastructure, VPC Network Topology

### Community 22 - "Utility Scripts"
Cohesion: 0.67
Nodes (1): Downloads all publicly available HuggingFace datasets automatically.
Datasets re

### Community 23 - "Module Group 23"
Cohesion: 0.67
Nodes (3): Cloudflare Edge Proxy, Lambda Origin Secret Rotator, X-Origin-Secret Middleware

### Community 24 - "Security & Rate Limiting"
Cohesion: 1.0
Nodes (2): Attack Detection & IP Risk Management, Per-IP Rate Limiting

### Community 26 - "WebSocket Codec"
Cohesion: 1.0
Nodes (1): HMAC-SHA256 Request Verification

### Community 27 - "Module Group 27"
Cohesion: 1.0
Nodes (1): Proof-of-Work Challenge

## Knowledge Gaps
- **259 isolated node(s):** `graphify_rebuild.py — One-shot NudR knowledge graph regeneration.

Usage:
py`, `Walk the project and return list of relevant files with metadata.`, `Compare against manifest to find changed files.`, `SHA-256 hash for cache keying.`, `Extract AST nodes and edges from a single Python file.` (+254 more)
  These have ≤1 connection - possible missing edges or undocumented components.
- **Thin community `Utility Scripts`** (3 nodes): `download_all_huggingface_datasets.py`, `Downloads all publicly available HuggingFace datasets automatically.
Datasets re`, `main()`
  Too small to be a meaningful cluster - may be noise or needs more connections extracted.
- **Thin community `Security & Rate Limiting`** (2 nodes): `Attack Detection & IP Risk Management`, `Per-IP Rate Limiting`
  Too small to be a meaningful cluster - may be noise or needs more connections extracted.
- **Thin community `WebSocket Codec`** (1 node): `HMAC-SHA256 Request Verification`
  Too small to be a meaningful cluster - may be noise or needs more connections extracted.
- **Thin community `Module Group 27`** (1 node): `Proof-of-Work Challenge`
  Too small to be a meaningful cluster - may be noise or needs more connections extracted.

## Suggested Questions
_Questions this graph is uniquely positioned to answer:_

- **Why does `parse()` connect `Token Management` to `Utility Scripts`, `Module Group 10`?**
  _High betweenness centrality (0.125) - this node is a cross-community bridge._
- **Why does `correct()` connect `Utility Scripts` to `Module Group 0`, `Utility Scripts`, `Module Group 2`, `Module Group 3`?**
  _High betweenness centrality (0.092) - this node is a cross-community bridge._
- **Why does `extract_ast_file()` connect `Module Group 10` to `Token Management`?**
  _High betweenness centrality (0.083) - this node is a cross-community bridge._
- **Are the 26 inferred relationships involving `train()` (e.g. with `__init__()` and `__init__()`) actually correct?**
  _`train()` has 26 INFERRED edges - model-reasoned connections that need verification._
- **Are the 26 inferred relationships involving `__init__()` (e.g. with `train()` and `__init__()`) actually correct?**
  _`__init__()` has 26 INFERRED edges - model-reasoned connections that need verification._
- **Are the 26 inferred relationships involving `__init__()` (e.g. with `train()` and `__init__()`) actually correct?**
  _`__init__()` has 26 INFERRED edges - model-reasoned connections that need verification._
- **Are the 26 inferred relationships involving `__init__()` (e.g. with `train()` and `__init__()`) actually correct?**
  _`__init__()` has 26 INFERRED edges - model-reasoned connections that need verification._
graphify-out/cost.json
ADDED
@@ -0,0 +1,36 @@
{
  "runs": [
    {
      "date": "2026-05-02T14:10:36.766309+00:00",
      "nodes": 527,
      "edges": 791,
      "communities": 27
    },
    {
      "date": "2026-05-02T14:38:36.641525+00:00",
      "nodes": 527,
      "edges": 791,
      "communities": 27
    },
    {
      "date": "2026-05-02T15:18:12.036397+00:00",
      "nodes": 535,
      "edges": 803,
      "communities": 26
    },
    {
      "date": "2026-05-02T15:51:26.719125+00:00",
      "nodes": 541,
      "edges": 861,
      "communities": 27
    },
    {
      "date": "2026-05-03T09:17:56.530188+00:00",
      "nodes": 549,
      "edges": 873,
      "communities": 28
    }
  ],
  "total_input_tokens": 0,
  "total_output_tokens": 0
}
graphify-out/graph.html
ADDED
The diff for this file is too large to render. See raw diff.

graphify-out/graph.json
ADDED
The diff for this file is too large to render. See raw diff.
graphify-out/manifest.json
ADDED
@@ -0,0 +1,444 @@
{
  ".gitignore": 1777701225.48157,
  ".train_stage1_done": 1777720832.7039733,
  "docker-compose.yml": 1777700239.9815593,
  "Dockerfile": 1777700223.9820845,
  "graph_codebase.py": 1777726565.2849488,
  "Plan.MD": 1777699641.0691116,
  "pyproject.toml": 1777700221.3256943,
  "README.md": 1777724083.753812,
  "requirements-dev.txt": 1777700217.251423,
  "requirements.txt": 1777711524.5703418,
  "start.sh": 1777710701.3410938,
  "todo_registry.md": 1777710900.2597346,
  "train.sh": 1777714202.448383,
  "checkpoints/human_pattern_classifier.pt": 1777729878.0041149,
  "checkpoints/best_model/adapter_config.json": 1777796825.1490068,
  "checkpoints/best_model/adapter_model.safetensors": 1777796825.1843758,
  "checkpoints/best_model/README.md": 1777796719.1975436,
  "checkpoints/best_model/special_tokens_map.json": 1777796824.250464,
  "checkpoints/best_model/spiece.model": 1777796824.2559996,
  "checkpoints/best_model/tokenizer.json": 1777796824.275346,
  "checkpoints/best_model/tokenizer_config.json": 1777796824.2501526,
  "checkpoints/best_model_merged/config.json": 1777798859.073088,
  "checkpoints/best_model_merged/generation_config.json": 1777798859.0735242,
  "checkpoints/best_model_merged/model.safetensors": 1777798860.1068735,
  "checkpoints/best_model_merged/special_tokens_map.json": 1777798860.7994945,
  "checkpoints/best_model_merged/spiece.model": 1777798860.8052647,
  "checkpoints/best_model_merged/tokenizer.json": 1777798860.8245687,
  "checkpoints/best_model_merged/tokenizer_config.json": 1777798860.799184,
  "checkpoints/checkpoint-1300/adapter_config.json": 1777795732.5574954,
  "checkpoints/checkpoint-1300/adapter_model.safetensors": 1777795732.5569153,
  "checkpoints/checkpoint-1300/optimizer.pt": 1777795732.6929996,
  "checkpoints/checkpoint-1300/README.md": 1777795723.992747,
  "checkpoints/checkpoint-1300/rng_state.pth": 1777795732.6944063,
  "checkpoints/checkpoint-1300/scheduler.pt": 1777795732.6934369,
  "checkpoints/checkpoint-1300/trainer_state.json": 1777795732.6950734,
  "checkpoints/checkpoint-1300/training_args.bin": 1777795732.5581925,
  "checkpoints/checkpoint-1500/adapter_config.json": 1777796453.407065,
  "checkpoints/checkpoint-1500/adapter_model.safetensors": 1777796453.4062335,
  "checkpoints/checkpoint-1500/optimizer.pt": 1777796453.5532222,
  "checkpoints/checkpoint-1500/README.md": 1777796441.7245266,
  "checkpoints/checkpoint-1500/rng_state.pth": 1777796453.5546432,
  "checkpoints/checkpoint-1500/scheduler.pt": 1777796453.553674,
  "checkpoints/checkpoint-1500/trainer_state.json": 1777796453.5553663,
  "checkpoints/checkpoint-1500/training_args.bin": 1777796453.4078538,
  "checkpoints/checkpoint-1515/adapter_config.json": 1777796517.9945214,
  "checkpoints/checkpoint-1515/adapter_model.safetensors": 1777796517.9887915,
  "checkpoints/checkpoint-1515/optimizer.pt": 1777796518.4291868,
  "checkpoints/checkpoint-1515/README.md": 1777796516.2816606,
  "checkpoints/checkpoint-1515/rng_state.pth": 1777796518.4307976,
  "checkpoints/checkpoint-1515/scheduler.pt": 1777796518.4297302,
  "checkpoints/checkpoint-1515/trainer_state.json": 1777796518.4315622,
  "checkpoints/checkpoint-1515/training_args.bin": 1777796518.001258,
  "configs/awl_config.yaml": 1777700189.1299732,
  "configs/inference_config.yaml": 1777799286.2174134,
  "configs/model_config.yaml": 1777700179.166181,
  "configs/training_config.yaml": 1777790468.5972416,
  "configs/training_config_fast.yaml": 1777790475.887508,
  "data/awl/academic_synonyms.json": 1777700285.7700574,
  "data/awl/coxhead_awl.txt": 1777700281.261102,
  "data/awl/domain_lexicons/humanities.txt": 1777700291.8022907,
  "data/awl/domain_lexicons/sciences.txt": 1777700297.4100578,
  "data/awl/domain_lexicons/social_sciences.txt": 1777700299.4182992,
  "data/cache/1356ff2104663316.pt": 1777790287.6153314,
  "data/cache/d6a64358c3ef403f.pt": 1777790307.475935,
  "data/processed/test.jsonl": 1777720842.027336,
  "data/processed/train.jsonl": 1777720841.9981437,
  "data/processed/val.jsonl": 1777720842.0180135,
  "data/raw/fce_v2.1.bea19.tar.gz": 1777701562.259877,
  "data/raw/wi+locness_v2.1.bea19.tar.gz": 1777701601.202943,
  "data/raw/fce/json_to_m2.py": 1593697400.0,
  "data/raw/fce/licence.txt": 1548259834.0,
  "data/raw/fce/readme.txt": 1593701121.0,
  "data/raw/fce/json/fce.dev.json": 1551887927.0,
  "data/raw/fce/json/fce.test.json": 1551887927.0,
  "data/raw/fce/json/fce.train.json": 1551887928.0,
  "data/raw/fce/m2/fce.dev.gold.bea19.m2": 1551908535.0,
  "data/raw/fce/m2/fce.test.gold.bea19.m2": 1551908549.0,
  "data/raw/fce/m2/fce.train.gold.bea19.m2": 1551908611.0,
  "data/raw/hf/gpt_wiki_intro/data-00000-of-00001.arrow": 1777704055.4466302,
  "data/raw/hf/gpt_wiki_intro/dataset_info.json": 1777704055.4477787,
  "data/raw/hf/gpt_wiki_intro/state.json": 1777704055.4473596,
  "data/raw/hf/mage/data-00000-of-00001.arrow": 1777704009.6226566,
  "data/raw/hf/mage/dataset_info.json": 1777704009.623809,
  "data/raw/hf/mage/state.json": 1777704009.6233914,
  "data/raw/hf/paws/data-00000-of-00001.arrow": 1777704298.0143042,
  "data/raw/hf/paws/dataset_info.json": 1777704298.0152135,
  "data/raw/hf/paws/state.json": 1777704298.0148978,
  "data/raw/hf/raid/data-00000-of-00025.arrow": 1777703696.3333108,
  "data/raw/hf/raid/data-00001-of-00025.arrow": 1777703698.4878266,
  "data/raw/hf/raid/data-00002-of-00025.arrow": 1777703700.7023206,
  "data/raw/hf/raid/data-00003-of-00025.arrow": 1777703712.7551422,
  "data/raw/hf/raid/data-00004-of-00025.arrow": 1777703715.8790066,
  "data/raw/hf/raid/data-00005-of-00025.arrow": 1777703727.0471604,
  "data/raw/hf/raid/data-00006-of-00025.arrow": 1777703739.229002,
  "data/raw/hf/raid/data-00007-of-00025.arrow": 1777703750.4085863,
  "data/raw/hf/raid/data-00008-of-00025.arrow": 1777703753.7418487,
  "data/raw/hf/raid/data-00009-of-00025.arrow": 1777703767.0649137,
  "data/raw/hf/raid/data-00010-of-00025.arrow": 1777703770.6492746,
  "data/raw/hf/raid/data-00011-of-00025.arrow": 1777703779.966218,
  "data/raw/hf/raid/data-00012-of-00025.arrow": 1777703782.763389,
  "data/raw/hf/raid/data-00013-of-00025.arrow": 1777703794.4995651,
  "data/raw/hf/raid/data-00014-of-00025.arrow": 1777703797.4540114,
  "data/raw/hf/raid/data-00015-of-00025.arrow": 1777703808.532667,
  "data/raw/hf/raid/data-00016-of-00025.arrow": 1777703813.8672874,
  "data/raw/hf/raid/data-00017-of-00025.arrow": 1777703827.7822654,
  "data/raw/hf/raid/data-00018-of-00025.arrow": 1777703839.699836,
  "data/raw/hf/raid/data-00019-of-00025.arrow": 1777703847.619066,
  "data/raw/hf/raid/data-00020-of-00025.arrow": 1777703850.5027363,
  "data/raw/hf/raid/data-00021-of-00025.arrow": 1777703862.0215914,
  "data/raw/hf/raid/data-00022-of-00025.arrow": 1777703872.856046,
  "data/raw/hf/raid/data-00023-of-00025.arrow": 1777703883.6765664,
  "data/raw/hf/raid/data-00024-of-00025.arrow": 1777703904.8737774,
  "data/raw/hf/raid/dataset_info.json": 1777703904.8914242,
  "data/raw/hf/raid/state.json": 1777703904.8853946,
  "data/raw/hf/wikitext103/data-00000-of-00002.arrow": 1777704280.352249,
  "data/raw/hf/wikitext103/data-00001-of-00002.arrow": 1777704282.4038906,
  "data/raw/hf/wikitext103/dataset_info.json": 1777704282.4051147,
  "data/raw/hf/wikitext103/state.json": 1777704282.4046695,
  "data/raw/hf/writing_prompts/data-00000-of-00002.arrow": 1777704198.527498,
  "data/raw/hf/writing_prompts/data-00001-of-00002.arrow": 1777704201.3078794,
  "data/raw/hf/writing_prompts/dataset_info.json": 1777704201.3090239,
  "data/raw/hf/writing_prompts/state.json": 1777704201.3085868,
|
| 124 |
+
"data/raw/jfleg/test.ref0": 1777701409.8719044,
|
| 125 |
+
"data/raw/jfleg/test.ref1": 1777701409.8726854,
|
| 126 |
+
"data/raw/jfleg/test.ref2": 1777701409.8734703,
|
| 127 |
+
"data/raw/jfleg/test.ref3": 1777701409.8742514,
|
| 128 |
+
"data/raw/jfleg/test.spellchecked.src": 1777701409.8642416,
|
| 129 |
+
"data/raw/jfleg/test.src": 1777701409.8653388,
|
| 130 |
+
"data/raw/jfleg_repo/EACLshort037.pdf": 1777701409.8443322,
|
| 131 |
+
"data/raw/jfleg_repo/README.md": 1777701409.8446841,
|
| 132 |
+
"data/raw/jfleg_repo/dev/dev.ref0": 1777701409.8457215,
|
| 133 |
+
"data/raw/jfleg_repo/dev/dev.ref1": 1777701409.846624,
|
| 134 |
+
"data/raw/jfleg_repo/dev/dev.ref2": 1777701409.8473954,
|
| 135 |
+
"data/raw/jfleg_repo/dev/dev.ref3": 1777701409.8481197,
|
| 136 |
+
"data/raw/jfleg_repo/dev/dev.spellchecked.src": 1777701409.8490207,
|
| 137 |
+
"data/raw/jfleg_repo/dev/dev.src": 1777701409.8498135,
|
| 138 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/dev.ref.m2": 1777701409.8316338,
|
| 139 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/getpostagger.sh": 1777701409.8319283,
|
| 140 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/m2converter.py": 1777701409.8322287,
|
| 141 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/README.md": 1777701409.8256564,
|
| 142 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/test.ref.m2": 1777701409.8371184,
|
| 143 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/util/assignIOB.py": 1777701409.838248,
|
| 144 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/util/edit_dist.py": 1777701409.8386652,
|
| 145 |
+
"data/raw/jfleg_repo/EACL_exp/m2converter/util/__init__.py": 1777701409.8375704,
|
| 146 |
+
"data/raw/jfleg_repo/EACL_exp/manual_eval/coded_sentences.csv": 1777701409.839717,
|
| 147 |
+
"data/raw/jfleg_repo/EACL_exp/manual_eval/README.md": 1777701409.8391445,
|
| 148 |
+
"data/raw/jfleg_repo/EACL_exp/mturk/pairwise.csv": 1777701409.8407602,
|
| 149 |
+
"data/raw/jfleg_repo/EACL_exp/mturk/sample.csv": 1777701409.8410628,
|
| 150 |
+
"data/raw/jfleg_repo/EACL_exp/mturk/template.html": 1777701409.841373,
|
| 151 |
+
"data/raw/jfleg_repo/eval/gleu.py": 1777701409.8503115,
|
| 152 |
+
"data/raw/jfleg_repo/eval/readme.md": 1777701409.8506277,
|
| 153 |
+
"data/raw/jfleg_repo/test/test.ref0": 1777701409.8520677,
|
| 154 |
+
"data/raw/jfleg_repo/test/test.ref1": 1777701409.8528638,
|
| 155 |
+
"data/raw/jfleg_repo/test/test.ref2": 1777701409.8536794,
|
| 156 |
+
"data/raw/jfleg_repo/test/test.ref3": 1777701409.8544674,
|
| 157 |
+
"data/raw/jfleg_repo/test/test.spellchecked.src": 1777701409.8553677,
|
| 158 |
+
"data/raw/jfleg_repo/test/test.src": 1777701409.8561919,
|
| 159 |
+
"data/raw/shanegerami/AI_Human.csv": 1777701568.543233,
|
| 160 |
+
"data/raw/starblasters8/data.csv": 1777703040.4595706,
|
| 161 |
+
"data/raw/starblasters8/data.parquet": 1777703067.4076133,
|
| 162 |
+
"data/raw/starblasters8/distribution.csv": 1777703067.4080453,
|
| 163 |
+
"data/raw/starblasters8/distribution.parquet": 1777703067.4084356,
|
| 164 |
+
"data/raw/starblasters8/prompts.csv": 1777703067.4240563,
|
| 165 |
+
"data/raw/starblasters8/prompts.parquet": 1777703067.4288754,
|
| 166 |
+
"data/raw/wi+locness/json_to_m2.py": 1593701174.0,
|
| 167 |
+
"data/raw/wi+locness/licence.wi.txt": 1548261267.0,
|
| 168 |
+
"data/raw/wi+locness/license.locness.txt": 1548344432.0,
|
| 169 |
+
"data/raw/wi+locness/readme.txt": 1593702230.0,
|
| 170 |
+
"data/raw/wi+locness/json/A.dev.json": 1548254108.0,
|
| 171 |
+
"data/raw/wi+locness/json/A.train.json": 1548254108.0,
|
| 172 |
+
"data/raw/wi+locness/json/B.dev.json": 1548254108.0,
|
| 173 |
+
"data/raw/wi+locness/json/B.train.json": 1548254108.0,
|
| 174 |
+
"data/raw/wi+locness/json/C.dev.json": 1548254108.0,
|
| 175 |
+
"data/raw/wi+locness/json/C.train.json": 1548254108.0,
|
| 176 |
+
"data/raw/wi+locness/json/N.dev.json": 1548255672.0,
|
| 177 |
+
"data/raw/wi+locness/m2/A.dev.gold.bea19.m2": 1551909610.0,
|
| 178 |
+
"data/raw/wi+locness/m2/A.train.gold.bea19.m2": 1551909604.0,
|
| 179 |
+
"data/raw/wi+locness/m2/ABC.train.gold.bea19.m2": 1593702095.0,
|
| 180 |
+
"data/raw/wi+locness/m2/ABCN.dev.gold.bea19.m2": 1551909944.0,
|
| 181 |
+
"data/raw/wi+locness/m2/B.dev.gold.bea19.m2": 1551909651.0,
|
| 182 |
+
"data/raw/wi+locness/m2/B.train.gold.bea19.m2": 1551909644.0,
|
| 183 |
+
"data/raw/wi+locness/m2/C.dev.gold.bea19.m2": 1551909684.0,
|
| 184 |
+
"data/raw/wi+locness/m2/C.train.gold.bea19.m2": 1551909678.0,
|
| 185 |
+
"data/raw/wi+locness/m2/N.dev.gold.bea19.m2": 1551909694.0,
|
| 186 |
+
"data/raw/wi+locness/test/ABCN.test.bea19.orig": 1593701979.0,
|
| 187 |
+
"data/raw/wi+locness/test/readme.txt": 1593702932.0,
|
| 188 |
+
"logs/events.out.tfevents.1777733169.bazzite.202618.0": 1777733169.3767228,
|
| 189 |
+
"logs/events.out.tfevents.1777733440.bazzite.206325.0": 1777733440.2441843,
|
| 190 |
+
"logs/events.out.tfevents.1777733727.bazzite.207730.0": 1777733727.503944,
|
| 191 |
+
"logs/events.out.tfevents.1777734559.bazzite.211747.0": 1777734559.4917176,
|
| 192 |
+
"logs/events.out.tfevents.1777735849.bazzite.215021.0": 1777735849.6431587,
|
| 193 |
+
"logs/events.out.tfevents.1777737794.bazzite.222265.0": 1777737794.4041593,
|
| 194 |
+
"logs/events.out.tfevents.1777738485.bazzite.226596.0": 1777738485.9317763,
|
| 195 |
+
"logs/events.out.tfevents.1777785111.bazzite.5847.0": 1777788329.172026,
|
| 196 |
+
"logs/events.out.tfevents.1777790308.bazzite.14979.0": 1777790308.4039745,
|
| 197 |
+
"logs/events.out.tfevents.1777790432.bazzite.18166.0": 1777790432.2569437,
|
| 198 |
+
"logs/events.out.tfevents.1777790600.bazzite.19895.0": 1777790600.9711528,
|
| 199 |
+
"logs/events.out.tfevents.1777790916.bazzite.22954.0": 1777791352.7881691,
|
| 200 |
+
"logs/events.out.tfevents.1777791700.bazzite.29722.0": 1777792139.67899,
|
| 201 |
+
"logs/events.out.tfevents.1777792299.bazzite.34388.0": 1777796441.5121546,
|
| 202 |
+
"scripts/download_all_huggingface_datasets.py": 1777702146.005388,
|
| 203 |
+
"scripts/download_datasets.sh": 1777700679.976215,
|
| 204 |
+
"scripts/download_kaggle_datasets.sh": 1777700695.699875,
|
| 205 |
+
"scripts/evaluate.py": 1777710622.2847967,
|
| 206 |
+
"scripts/preprocess_data.py": 1777701728.828645,
|
| 207 |
+
"scripts/pretrain_human_pattern_classifier.py": 1777710565.377371,
|
| 208 |
+
"scripts/run_inference.py": 1777710636.728075,
|
| 209 |
+
"scripts/train.py": 1777796693.4284217,
|
| 210 |
+
"src/__init__.py": 1777700367.1651394,
|
| 211 |
+
"src/api/main.py": 1777710501.3492658,
|
| 212 |
+
"src/api/middleware.py": 1777710502.144811,
|
| 213 |
+
"src/api/schemas.py": 1777700655.5228736,
|
| 214 |
+
"src/api/__init__.py": 1777700367.176363,
|
| 215 |
+
"src/evaluation/authorship_verifier.py": 1777710422.882881,
|
| 216 |
+
"src/evaluation/errant_evaluator.py": 1777710414.1773353,
|
| 217 |
+
"src/evaluation/gleu_scorer.py": 1777710402.1214068,
|
| 218 |
+
"src/evaluation/style_metrics.py": 1777710421.8995192,
|
| 219 |
+
"src/evaluation/__init__.py": 1777700367.1744816,
|
| 220 |
+
"src/inference/corrector.py": 1777799272.1892536,
|
| 221 |
+
"src/inference/postprocessor.py": 1777799529.931668,
|
| 222 |
+
"src/inference/__init__.py": 1777700367.1754317,
|
| 223 |
+
"src/model/base_model.py": 1777789062.6184208,
|
| 224 |
+
"src/model/generation_utils.py": 1777710219.7970757,
|
| 225 |
+
"src/model/lora_adapter.py": 1777710206.3699143,
|
| 226 |
+
"src/model/style_conditioner.py": 1777789195.3776248,
|
| 227 |
+
"src/model/__init__.py": 1777700367.1716762,
|
| 228 |
+
"src/preprocessing/dependency_parser.py": 1777709958.1169899,
|
| 229 |
+
"src/preprocessing/dyslexia_simulator.py": 1777709998.6640317,
|
| 230 |
+
"src/preprocessing/ner_tagger.py": 1777709980.1368325,
|
| 231 |
+
"src/preprocessing/pipeline.py": 1777710000.6269286,
|
| 232 |
+
"src/preprocessing/sentence_segmenter.py": 1777709951.8658924,
|
| 233 |
+
"src/preprocessing/spell_corrector.py": 1777710998.2651775,
|
| 234 |
+
"src/preprocessing/__init__.py": 1777700367.1695316,
|
| 235 |
+
"src/style/emotion_classifier.py": 1777710084.9253688,
|
| 236 |
+
"src/style/fingerprinter.py": 1777733588.7603915,
|
| 237 |
+
"src/style/formality_classifier.py": 1777710041.056987,
|
| 238 |
+
"src/style/style_vector.py": 1777710029.7282178,
|
| 239 |
+
"src/style/__init__.py": 1777700367.1707523,
|
| 240 |
+
"src/training/callbacks.py": 1777710375.39277,
|
| 241 |
+
"src/training/dataset.py": 1777736946.1787465,
|
| 242 |
+
"src/training/human_pattern_extractor.py": 1777721296.1845315,
|
| 243 |
+
"src/training/loss_functions.py": 1777734093.3399415,
|
| 244 |
+
"src/training/trainer.py": 1777792224.759529,
|
| 245 |
+
"src/training/__init__.py": 1777700367.172702,
|
| 246 |
+
"src/vocabulary/awl_loader.py": 1777710137.5959558,
|
| 247 |
+
"src/vocabulary/lexical_substitution.py": 1777799073.7536068,
|
| 248 |
+
"src/vocabulary/register_filter.py": 1777711030.1810205,
|
| 249 |
+
"src/vocabulary/__init__.py": 1777700367.1736517,
|
| 250 |
+
"tests/test_evaluation.py": 1777710754.1602647,
|
| 251 |
+
"tests/test_model.py": 1777710746.0170994,
|
| 252 |
+
"tests/test_preprocessing.py": 1777710730.7286103,
|
| 253 |
+
"tests/test_style.py": 1777710738.944049,
|
| 254 |
+
"tests/test_vocabulary.py": 1777710752.9497588,
|
| 255 |
+
"wandb/debug-internal.log": 1777796523.944181,
|
| 256 |
+
"wandb/debug.log": 1777796521.577159,
|
| 257 |
+
"wandb/run-20260502_150043-2fg22e6p/run-2fg22e6p.wandb": 1777720317.7069192,
|
| 258 |
+
"wandb/run-20260502_150043-2fg22e6p/files/config.yaml": 1777720313.7898095,
|
| 259 |
+
"wandb/run-20260502_150043-2fg22e6p/files/output.log": 1777720313.775867,
|
| 260 |
+
"wandb/run-20260502_150043-2fg22e6p/files/requirements.txt": 1777714246.1567795,
|
| 261 |
+
"wandb/run-20260502_150043-2fg22e6p/files/wandb-metadata.json": 1777714246.3533409,
|
| 262 |
+
"wandb/run-20260502_150043-2fg22e6p/files/wandb-summary.json": 1777720313.7819676,
|
| 263 |
+
"wandb/run-20260502_150043-2fg22e6p/logs/debug-core.log": 1777720317.7154906,
|
| 264 |
+
"wandb/run-20260502_150043-2fg22e6p/logs/debug-internal.log": 1777720317.7080636,
|
| 265 |
+
"wandb/run-20260502_150043-2fg22e6p/logs/debug.log": 1777720313.7399838,
|
| 266 |
+
"wandb/run-20260502_165105-pwnhqrrf/run-pwnhqrrf.wandb": 1777720942.0482328,
|
| 267 |
+
"wandb/run-20260502_165105-pwnhqrrf/files/config.yaml": 1777720940.353539,
|
| 268 |
+
"wandb/run-20260502_165105-pwnhqrrf/files/output.log": 1777720940.3449767,
|
| 269 |
+
"wandb/run-20260502_165105-pwnhqrrf/files/requirements.txt": 1777720873.2094295,
|
| 270 |
+
"wandb/run-20260502_165105-pwnhqrrf/files/wandb-metadata.json": 1777720871.3923895,
|
| 271 |
+
"wandb/run-20260502_165105-pwnhqrrf/files/wandb-summary.json": 1777720940.3480256,
|
| 272 |
+
"wandb/run-20260502_165105-pwnhqrrf/logs/debug-core.log": 1777720942.0548975,
|
| 273 |
+
"wandb/run-20260502_165105-pwnhqrrf/logs/debug-internal.log": 1777720942.049499,
|
| 274 |
+
"wandb/run-20260502_165105-pwnhqrrf/logs/debug.log": 1777720940.2928586,
|
| 275 |
+
"wandb/run-20260502_165541-4d797dih/run-4d797dih.wandb": 1777721193.2341652,
|
| 276 |
+
"wandb/run-20260502_165541-4d797dih/files/config.yaml": 1777721190.8295085,
|
| 277 |
+
"wandb/run-20260502_165541-4d797dih/files/output.log": 1777721190.8289042,
|
| 278 |
+
"wandb/run-20260502_165541-4d797dih/files/requirements.txt": 1777721148.3496826,
|
| 279 |
+
"wandb/run-20260502_165541-4d797dih/files/wandb-metadata.json": 1777721146.848307,
|
| 280 |
+
"wandb/run-20260502_165541-4d797dih/files/wandb-summary.json": 1777721190.829134,
|
| 281 |
+
"wandb/run-20260502_165541-4d797dih/logs/debug-core.log": 1777721193.237868,
|
| 282 |
+
"wandb/run-20260502_165541-4d797dih/logs/debug-internal.log": 1777721193.2342758,
|
| 283 |
+
"wandb/run-20260502_165541-4d797dih/logs/debug.log": 1777721190.8276412,
|
| 284 |
+
"wandb/run-20260502_165926-36ppiwlg/run-36ppiwlg.wandb": 1777729880.6180944,
|
| 285 |
+
"wandb/run-20260502_165926-36ppiwlg/files/config.yaml": 1777729878.7727947,
|
| 286 |
+
"wandb/run-20260502_165926-36ppiwlg/files/output.log": 1777729878.0086646,
|
| 287 |
+
"wandb/run-20260502_165926-36ppiwlg/files/requirements.txt": 1777721373.2203636,
|
| 288 |
+
"wandb/run-20260502_165926-36ppiwlg/files/wandb-metadata.json": 1777721371.7235086,
|
| 289 |
+
"wandb/run-20260502_165926-36ppiwlg/files/wandb-summary.json": 1777729878.0101912,
|
| 290 |
+
"wandb/run-20260502_165926-36ppiwlg/logs/debug-core.log": 1777729880.6459498,
|
| 291 |
+
"wandb/run-20260502_165926-36ppiwlg/logs/debug-internal.log": 1777729880.61819,
|
| 292 |
+
"wandb/run-20260502_165926-36ppiwlg/logs/debug.log": 1777729880.617355,
|
| 293 |
+
"wandb/run-20260502_192151-h1jq4pkw/run-h1jq4pkw.wandb": 1777731875.1844988,
|
| 294 |
+
"wandb/run-20260502_192151-h1jq4pkw/files/config.yaml": 1777731873.207762,
|
| 295 |
+
"wandb/run-20260502_192151-h1jq4pkw/files/output.log": 1777731873.1929135,
|
| 296 |
+
"wandb/run-20260502_192151-h1jq4pkw/files/requirements.txt": 1777729913.3605578,
|
| 297 |
+
"wandb/run-20260502_192151-h1jq4pkw/files/wandb-metadata.json": 1777729913.5862672,
|
| 298 |
+
"wandb/run-20260502_192151-h1jq4pkw/files/wandb-summary.json": 1777731873.1960742,
|
| 299 |
+
"wandb/run-20260502_192151-h1jq4pkw/logs/debug-core.log": 1777731875.1935635,
|
| 300 |
+
"wandb/run-20260502_192151-h1jq4pkw/logs/debug-internal.log": 1777731875.1859224,
|
| 301 |
+
"wandb/run-20260502_192151-h1jq4pkw/logs/debug.log": 1777731873.1659987,
|
| 302 |
+
"wandb/run-20260502_200514-kl2gg5g9/run-kl2gg5g9.wandb": 1777733212.1297417,
|
| 303 |
+
"wandb/run-20260502_200514-kl2gg5g9/files/config.yaml": 1777733209.4133239,
|
| 304 |
+
"wandb/run-20260502_200514-kl2gg5g9/files/output.log": 1777733209.4107795,
|
| 305 |
+
"wandb/run-20260502_200514-kl2gg5g9/files/requirements.txt": 1777732516.9594064,
|
| 306 |
+
"wandb/run-20260502_200514-kl2gg5g9/files/wandb-metadata.json": 1777732517.0559525,
|
| 307 |
+
"wandb/run-20260502_200514-kl2gg5g9/files/wandb-summary.json": 1777733209.411088,
|
| 308 |
+
"wandb/run-20260502_200514-kl2gg5g9/logs/debug-core.log": 1777733212.141411,
|
| 309 |
+
"wandb/run-20260502_200514-kl2gg5g9/logs/debug-internal.log": 1777733212.1310723,
|
| 310 |
+
"wandb/run-20260502_200514-kl2gg5g9/logs/debug.log": 1777733209.404857,
|
| 311 |
+
"wandb/run-20260502_201947-ngpyijum/run-ngpyijum.wandb": 1777733478.2693913,
|
| 312 |
+
"wandb/run-20260502_201947-ngpyijum/files/config.yaml": 1777733476.6917355,
|
| 313 |
+
"wandb/run-20260502_201947-ngpyijum/files/output.log": 1777733476.6534271,
|
| 314 |
+
"wandb/run-20260502_201947-ngpyijum/files/requirements.txt": 1777733389.9631994,
|
| 315 |
+
"wandb/run-20260502_201947-ngpyijum/files/wandb-metadata.json": 1777733390.1321378,
|
| 316 |
+
"wandb/run-20260502_201947-ngpyijum/files/wandb-summary.json": 1777733476.656282,
|
| 317 |
+
"wandb/run-20260502_201947-ngpyijum/logs/debug-core.log": 1777733478.2785654,
|
| 318 |
+
"wandb/run-20260502_201947-ngpyijum/logs/debug-internal.log": 1777733478.2707117,
|
| 319 |
+
"wandb/run-20260502_201947-ngpyijum/logs/debug.log": 1777733476.5978289,
|
| 320 |
+
"wandb/run-20260502_202439-7n7pnref/run-7n7pnref.wandb": 1777733793.9917243,
|
| 321 |
+
"wandb/run-20260502_202439-7n7pnref/files/config.yaml": 1777733792.4110804,
|
| 322 |
+
"wandb/run-20260502_202439-7n7pnref/files/output.log": 1777733792.3750265,
|
| 323 |
+
"wandb/run-20260502_202439-7n7pnref/files/requirements.txt": 1777733681.1639447,
|
| 324 |
+
"wandb/run-20260502_202439-7n7pnref/files/wandb-metadata.json": 1777733681.322012,
|
| 325 |
+
"wandb/run-20260502_202439-7n7pnref/files/wandb-summary.json": 1777733792.378697,
|
| 326 |
+
"wandb/run-20260502_202439-7n7pnref/logs/debug-core.log": 1777733793.9979222,
|
| 327 |
+
"wandb/run-20260502_202439-7n7pnref/logs/debug-internal.log": 1777733793.9930737,
|
| 328 |
+
"wandb/run-20260502_202439-7n7pnref/logs/debug.log": 1777733792.3520947,
|
| 329 |
+
"wandb/run-20260502_203519-fib23yhh/run-fib23yhh.wandb": 1777734591.343925,
|
| 330 |
+
"wandb/run-20260502_203519-fib23yhh/files/output.log": 1777734994.8215227,
|
| 331 |
+
"wandb/run-20260502_203519-fib23yhh/files/requirements.txt": 1777734321.4982467,
|
| 332 |
+
"wandb/run-20260502_203519-fib23yhh/files/wandb-metadata.json": 1777734321.6402895,
|
| 333 |
+
"wandb/run-20260502_203519-fib23yhh/logs/debug-core.log": 1777735036.470297,
|
| 334 |
+
"wandb/run-20260502_203519-fib23yhh/logs/debug-internal.log": 1777735026.9774451,
|
| 335 |
+
"wandb/run-20260502_203519-fib23yhh/logs/debug.log": 1777734559.486897,
|
| 336 |
+
"wandb/run-20260502_204834-03roqvb7/run-03roqvb7.wandb": 1777735857.4507105,
|
| 337 |
+
"wandb/run-20260502_204834-03roqvb7/files/config.yaml": 1777735855.5278394,
|
| 338 |
+
"wandb/run-20260502_204834-03roqvb7/files/output.log": 1777735854.776806,
|
| 339 |
+
"wandb/run-20260502_204834-03roqvb7/files/requirements.txt": 1777735116.9440887,
|
| 340 |
+
"wandb/run-20260502_204834-03roqvb7/files/wandb-metadata.json": 1777735117.0886073,
|
| 341 |
+
"wandb/run-20260502_204834-03roqvb7/files/wandb-summary.json": 1777735854.7797687,
|
| 342 |
+
"wandb/run-20260502_204834-03roqvb7/logs/debug-core.log": 1777735857.4598973,
|
| 343 |
+
"wandb/run-20260502_204834-03roqvb7/logs/debug-internal.log": 1777735857.451936,
|
| 344 |
+
"wandb/run-20260502_204834-03roqvb7/logs/debug.log": 1777735854.702104,
|
| 345 |
+
"wandb/run-20260502_210534-j0t4q38m/run-j0t4q38m.wandb": 1777736782.127596,
|
| 346 |
+
"wandb/run-20260502_210534-j0t4q38m/files/config.yaml": 1777736780.0784059,
|
| 347 |
+
"wandb/run-20260502_210534-j0t4q38m/files/output.log": 1777736780.0776114,
|
| 348 |
+
"wandb/run-20260502_210534-j0t4q38m/files/requirements.txt": 1777736140.9308562,
|
| 349 |
+
"wandb/run-20260502_210534-j0t4q38m/files/wandb-metadata.json": 1777736139.3660376,
|
| 350 |
+
"wandb/run-20260502_210534-j0t4q38m/files/wandb-summary.json": 1777736780.0778146,
|
| 351 |
+
"wandb/run-20260502_210534-j0t4q38m/logs/debug-core.log": 1777736782.1309361,
|
| 352 |
+
"wandb/run-20260502_210534-j0t4q38m/logs/debug-internal.log": 1777736782.1277256,
|
| 353 |
+
"wandb/run-20260502_210534-j0t4q38m/logs/debug.log": 1777736780.076756,
|
| 354 |
+
"wandb/run-20260502_212127-vl8pftkj/run-vl8pftkj.wandb": 1777737801.436665,
|
| 355 |
+
"wandb/run-20260502_212127-vl8pftkj/files/config.yaml": 1777737799.6158743,
|
| 356 |
+
"wandb/run-20260502_212127-vl8pftkj/files/output.log": 1777737798.8592515,
|
| 357 |
+
"wandb/run-20260502_212127-vl8pftkj/files/requirements.txt": 1777737089.2927256,
|
| 358 |
+
"wandb/run-20260502_212127-vl8pftkj/files/wandb-metadata.json": 1777737089.4481473,
|
| 359 |
+
"wandb/run-20260502_212127-vl8pftkj/files/wandb-summary.json": 1777737798.8655431,
|
| 360 |
+
"wandb/run-20260502_212127-vl8pftkj/logs/debug-core.log": 1777737801.4842963,
|
| 361 |
+
"wandb/run-20260502_212127-vl8pftkj/logs/debug-internal.log": 1777737801.4381168,
|
| 362 |
+
"wandb/run-20260502_212127-vl8pftkj/logs/debug.log": 1777737798.7922306,
|
| 363 |
+
"wandb/run-20260502_213822-mmm9bdu9/run-mmm9bdu9.wandb": 1777738718.0236964,
|
| 364 |
+
"wandb/run-20260502_213822-mmm9bdu9/files/config.yaml": 1777738715.2277923,
|
| 365 |
+
"wandb/run-20260502_213822-mmm9bdu9/files/output.log": 1777738715.1982114,
|
| 366 |
+
"wandb/run-20260502_213822-mmm9bdu9/files/requirements.txt": 1777738106.5846484,
|
| 367 |
+
"wandb/run-20260502_213822-mmm9bdu9/files/wandb-metadata.json": 1777738105.154441,
|
| 368 |
+
"wandb/run-20260502_213822-mmm9bdu9/files/wandb-summary.json": 1777738715.2006595,
|
| 369 |
+
"wandb/run-20260502_213822-mmm9bdu9/logs/debug-core.log": 1777738718.0312166,
|
| 370 |
+
"wandb/run-20260502_213822-mmm9bdu9/logs/debug-internal.log": 1777738718.0251148,
|
| 371 |
+
"wandb/run-20260502_213822-mmm9bdu9/logs/debug.log": 1777738715.1964543,
|
| 372 |
+
"wandb/run-20260503_104137-zjr4w5ln/run-zjr4w5ln.wandb": 1777789114.0511775,
|
| 373 |
+
"wandb/run-20260503_104137-zjr4w5ln/files/config.yaml": 1777789229.2851222,
|
| 374 |
+
"wandb/run-20260503_104137-zjr4w5ln/files/output.log": 1777789229.2830012,
|
| 375 |
+
"wandb/run-20260503_104137-zjr4w5ln/files/requirements.txt": 1777785104.199556,
|
| 376 |
+
"wandb/run-20260503_104137-zjr4w5ln/files/wandb-metadata.json": 1777785102.3896415,
|
| 377 |
+
"wandb/run-20260503_104137-zjr4w5ln/files/wandb-summary.json": 1777789229.283297,
|
| 378 |
+
"wandb/run-20260503_104137-zjr4w5ln/logs/debug-core.log": 1777789229.6806114,
|
| 379 |
+
"wandb/run-20260503_104137-zjr4w5ln/logs/debug-internal.log": 1777789229.2004015,
|
| 380 |
+
"wandb/run-20260503_104137-zjr4w5ln/logs/debug.log": 1777789229.5057423,
|
| 381 |
+
"wandb/run-20260503_120130-xzkygl93/run-xzkygl93.wandb": 1777789897.810659,
|
| 382 |
+
"wandb/run-20260503_120130-xzkygl93/files/config.yaml": 1777789896.1235933,
|
| 383 |
+
"wandb/run-20260503_120130-xzkygl93/files/output.log": 1777789895.7228522,
|
| 384 |
+
"wandb/run-20260503_120130-xzkygl93/files/requirements.txt": 1777789895.719956,
|
| 385 |
+
"wandb/run-20260503_120130-xzkygl93/files/wandb-metadata.json": 1777789895.5577607,
|
| 386 |
+
"wandb/run-20260503_120130-xzkygl93/files/wandb-summary.json": 1777789895.7230475,
|
| 387 |
+
"wandb/run-20260503_120130-xzkygl93/logs/debug-core.log": 1777789897.8687205,
|
| 388 |
+
"wandb/run-20260503_120130-xzkygl93/logs/debug-internal.log": 1777789897.810803,
|
| 389 |
+
"wandb/run-20260503_120130-xzkygl93/logs/debug.log": 1777789895.7187955,
|
| 390 |
+
"wandb/run-20260503_120403-cbb6slr5/run-cbb6slr5.wandb": 1777790315.1058269,
|
| 391 |
+
"wandb/run-20260503_120403-cbb6slr5/files/config.yaml": 1777790312.1412077,
|
| 392 |
+
"wandb/run-20260503_120403-cbb6slr5/files/output.log": 1777790311.667267,
|
| 393 |
+
"wandb/run-20260503_120403-cbb6slr5/files/requirements.txt": 1777790045.13399,
|
| 394 |
+
"wandb/run-20260503_120403-cbb6slr5/files/wandb-metadata.json": 1777790045.282678,
|
| 395 |
+
"wandb/run-20260503_120403-cbb6slr5/files/wandb-summary.json": 1777790311.6676607,
|
| 396 |
+
"wandb/run-20260503_120403-cbb6slr5/logs/debug-core.log": 1777790315.1408749,
|
| 397 |
+
"wandb/run-20260503_120403-cbb6slr5/logs/debug-internal.log": 1777790315.1073751,
|
| 398 |
+
"wandb/run-20260503_120403-cbb6slr5/logs/debug.log": 1777790311.6647966,
|
| 399 |
+
"wandb/run-20260503_121016-impcgg4z/run-impcgg4z.wandb": 1777790434.7027235,
|
| 400 |
+
"wandb/run-20260503_121016-impcgg4z/files/config.yaml": 1777790433.1677222,
|
| 401 |
+
"wandb/run-20260503_121016-impcgg4z/files/output.log": 1777790432.6490448,
|
| 402 |
+
"wandb/run-20260503_121016-impcgg4z/files/requirements.txt": 1777790418.176722,
|
| 403 |
+
"wandb/run-20260503_121016-impcgg4z/files/wandb-metadata.json": 1777790418.3191545,
|
| 404 |
+
"wandb/run-20260503_121016-impcgg4z/files/wandb-summary.json": 1777790432.6492898,
|
| 405 |
+
"wandb/run-20260503_121016-impcgg4z/logs/debug-core.log": 1777790434.707022,
|
| 406 |
+
"wandb/run-20260503_121016-impcgg4z/logs/debug-internal.log": 1777790434.7028248,
|
| 407 |
+
"wandb/run-20260503_121016-impcgg4z/logs/debug.log": 1777790434.7233517,
|
| 408 |
+
"wandb/run-20260503_121312-l9gn41e7/run-l9gn41e7.wandb": 1777790731.9692764,
|
| 409 |
+
"wandb/run-20260503_121312-l9gn41e7/files/config.yaml": 1777790729.3662996,
|
| 410 |
+
"wandb/run-20260503_121312-l9gn41e7/files/output.log": 1777790729.3632636,
|
| 411 |
+
"wandb/run-20260503_121312-l9gn41e7/files/requirements.txt": 1777790594.305007,
|
| 412 |
+
"wandb/run-20260503_121312-l9gn41e7/files/wandb-metadata.json": 1777790594.4471908,
|
| 413 |
+
"wandb/run-20260503_121312-l9gn41e7/files/wandb-summary.json": 1777790729.363634,
|
| 414 |
+
"wandb/run-20260503_121312-l9gn41e7/logs/debug-core.log": 1777790731.9724958,
|
| 415 |
+
"wandb/run-20260503_121312-l9gn41e7/logs/debug-internal.log": 1777790731.9694047,
|
| 416 |
+
"wandb/run-20260503_121312-l9gn41e7/logs/debug.log": 1777790729.3612194,
|
| 417 |
+
"wandb/run-20260503_121828-7pvaltt8/run-7pvaltt8.wandb": 1777791356.9240425,
|
| 418 |
+
"wandb/run-20260503_121828-7pvaltt8/files/config.yaml": 1777791353.5784223,
|
| 419 |
+
"wandb/run-20260503_121828-7pvaltt8/files/output.log": 1777791353.5761926,
|
| 420 |
+
"wandb/run-20260503_121828-7pvaltt8/files/requirements.txt": 1777790910.2114842,
|
| 421 |
+
"wandb/run-20260503_121828-7pvaltt8/files/wandb-metadata.json": 1777790910.3638337,
|
| 422 |
+
"wandb/run-20260503_121828-7pvaltt8/files/wandb-summary.json": 1777791353.5765028,
|
| 423 |
+
"wandb/run-20260503_121828-7pvaltt8/logs/debug-core.log": 1777791356.92785,
|
| 424 |
+
"wandb/run-20260503_121828-7pvaltt8/logs/debug-internal.log": 1777791356.9241953,
|
| 425 |
+
"wandb/run-20260503_121828-7pvaltt8/logs/debug.log": 1777791353.573493,
|
| 426 |
+
"wandb/run-20260503_123131-4y9tqaim/run-4y9tqaim.wandb": 1777792146.311411,
|
| 427 |
+
"wandb/run-20260503_123131-4y9tqaim/files/config.yaml": 1777792144.4709916,
|
| 428 |
+
"wandb/run-20260503_123131-4y9tqaim/files/output.log": 1777792143.9418323,
|
| 429 |
+
"wandb/run-20260503_123131-4y9tqaim/files/requirements.txt": 1777791693.6865625,
|
| 430 |
+
"wandb/run-20260503_123131-4y9tqaim/files/wandb-metadata.json": 1777791693.8298368,
|
| 431 |
+
"wandb/run-20260503_123131-4y9tqaim/files/wandb-summary.json": 1777792143.9447422,
|
| 432 |
+
"wandb/run-20260503_123131-4y9tqaim/logs/debug-core.log": 1777792146.3145404,
|
| 433 |
+
"wandb/run-20260503_123131-4y9tqaim/logs/debug-internal.log": 1777792146.3116014,
|
| 434 |
+
"wandb/run-20260503_123131-4y9tqaim/logs/debug.log": 1777792143.9181907,
|
| 435 |
+
"wandb/run-20260503_124131-7q4dwe22/run-7q4dwe22.wandb": 1777796523.9426115,
|
| 436 |
+
"wandb/run-20260503_124131-7q4dwe22/files/config.yaml": 1777796521.6113658,
|
| 437 |
+
"wandb/run-20260503_124131-7q4dwe22/files/output.log": 1777796520.1061456,
|
| 438 |
+
"wandb/run-20260503_124131-7q4dwe22/files/requirements.txt": 1777792293.6231525,
|
| 439 |
+
"wandb/run-20260503_124131-7q4dwe22/files/wandb-metadata.json": 1777792293.7842615,
|
| 440 |
+
"wandb/run-20260503_124131-7q4dwe22/files/wandb-summary.json": 1777796521.5802517,
|
| 441 |
+
"wandb/run-20260503_124131-7q4dwe22/logs/debug-core.log": 1777796524.016464,
|
| 442 |
+
"wandb/run-20260503_124131-7q4dwe22/logs/debug-internal.log": 1777796523.944181,
|
| 443 |
+
"wandb/run-20260503_124131-7q4dwe22/logs/debug.log": 1777796521.577159
|
| 444 |
+
}
|
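The block above closes the manifest (graphify-out/manifest.json): a flat JSON object mapping every tracked file path to its last-modified Unix timestamp in fractional seconds. A minimal sketch of consuming it, assuming that path and a hypothetical helper name files_changed_since:

import json
from datetime import datetime, timezone

def files_changed_since(manifest_path: str, cutoff_ts: float) -> list[str]:
    """Return manifest paths whose recorded mtime is newer than cutoff_ts."""
    with open(manifest_path) as f:
        manifest = json.load(f)  # {"path": unix_mtime, ...}
    return sorted(path for path, ts in manifest.items() if ts > cutoff_ts)

# Example: list everything touched after 2026-05-03 00:00 UTC.
cutoff = datetime(2026, 5, 3, tzinfo=timezone.utc).timestamp()
for path in files_changed_since("graphify-out/manifest.json", cutoff):
    print(path)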
pyproject.toml
ADDED
@@ -0,0 +1,42 @@
[build-system]
requires = ["setuptools>=68.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "dyslexia-writing-ai"
version = "1.0.0"
description = "Style-preserving, grammar-correcting, academic vocabulary elevating AI model for dyslectic writing"
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.10"
authors = [
    {name = "morpheuslord"},
]

[project.scripts]
train = "scripts.train:train"

[tool.setuptools.packages.find]
include = ["src*"]

[tool.black]
line-length = 120
target-version = ["py310"]

[tool.ruff]
line-length = 120
target-version = "py310"

[tool.ruff.lint]
select = ["E", "F", "W", "I", "N", "UP"]

[tool.mypy]
python_version = "3.10"
warn_return_any = true
warn_unused_configs = true
ignore_missing_imports = true

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
addopts = "-v --tb=short"
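A note on the [project.scripts] table: after an editable install (pip install -e .), the train console command resolves to the train() function in scripts/train.py. Since [tool.setuptools.packages.find] only includes src*, the scripts package is not part of the installed distribution, so depending on the install mode the entry point may fail to import; running python scripts/train.py from the repository root side-steps the packaging question.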
requirements-dev.txt
ADDED
@@ -0,0 +1,9 @@
pytest==8.1.1
pytest-asyncio==0.23.6
pytest-cov==5.0.0
black==24.4.0
ruff==0.4.1
mypy==1.9.0
pre-commit==3.7.0
ipykernel==6.29.4
jupyter==1.0.0
requirements.txt
ADDED
@@ -0,0 +1,59 @@
# ── Core ML & Deep Learning ──────────────────────────────────────────────────
torch>=2.9.0
torchvision>=0.20.0
torchaudio>=2.9.0
transformers>=4.40.0
datasets>=2.18.0
accelerate>=0.29.0
peft>=0.10.0              # LoRA / parameter-efficient fine-tuning
bitsandbytes>=0.43.0      # 8-bit & 4-bit quantization
sentencepiece>=0.2.0      # T5 tokenizer dependency
protobuf>=4.25.0          # T5 tokenizer dependency

# ── Sentence Embeddings ───────────────────────────────────────────────────────
sentence-transformers>=2.6.0
faiss-cpu>=1.8.0          # Vector similarity search

# ── NLP Pre-Processing ────────────────────────────────────────────────────────
spacy>=3.7.0
spacy-transformers>=1.3.0
language-tool-python>=2.7.0   # LanguageTool grammar checker
pyspellchecker>=0.8.0         # Context-free spell check (pre-pass)
nltk>=3.8.0
textstat>=0.7.0               # Readability scores (Flesch-Kincaid, etc.)

# ── Lexical Substitution ─────────────────────────────────────────────────────
wordfreq>=3.1.0               # Word frequency data

# ── Training Infrastructure ───────────────────────────────────────────────────
wandb>=0.16.0                 # Experiment tracking
tensorboard>=2.16.0
numpy>=1.26.0
pandas>=2.2.0
scikit-learn>=1.4.0
scipy>=1.13.0

# ── Evaluation Tools ──────────────────────────────────────────────────────────
errant>=2.3.0                 # Grammar Error Annotation Toolkit
sacrebleu>=2.4.0              # BLEU/GLEU scoring
bert-score>=0.3.13            # Semantic similarity scoring
rouge-score>=0.1.2

# ── API Server ────────────────────────────────────────────────────────────────
fastapi>=0.110.0
uvicorn[standard]>=0.29.0
pydantic>=2.7.0
python-multipart>=0.0.9
httpx>=0.27.0

# ── Inference Optimisation ────────────────────────────────────────────────────
optimum>=1.19.0               # Hugging Face model optimisation

# ── Utilities ─────────────────────────────────────────────────────────────────
pyyaml>=6.0.1
tqdm>=4.66.0
loguru>=0.7.0
python-dotenv>=1.0.0
click>=8.1.0
rich>=13.7.0                  # Beautiful terminal output
joblib>=1.4.0
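Both requirement files are plain pip lists, so pip install -r requirements.txt -r requirements-dev.txt installs runtime and development tooling in one step. One caveat: spaCy models are not pip packages, and the en_core_web_sm model referenced by scripts/evaluate.py and the classifier pre-training script below still needs a separate python -m spacy download en_core_web_sm.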
scripts/download_all_huggingface_datasets.py
ADDED
@@ -0,0 +1,61 @@
"""
Downloads all publicly available HuggingFace datasets automatically.
Datasets requiring registration/request are flagged with instructions.

Run: python scripts/download_all_huggingface_datasets.py
"""

from datasets import load_dataset
import os

os.makedirs("data/raw/hf", exist_ok=True)

# (hf_identifier, config, split, output_subdir)
# Removed trust_remote_code — deprecated in newer datasets versions.
# Removed datasets that no longer exist or require custom loading scripts.
HF_DATASETS = [
    ("liamdugan/raid", None, "train", "raid"),
    ("Hello-SimpleAI/HC3", "all", "train", "hc3"),
    ("yaful/MAGE", None, "train", "mage"),
    ("aadityaubhat/GPT-wiki-intro", None, "train", "gpt_wiki_intro"),
    ("euclaise/writingprompts", None, "train", "writing_prompts"),
    ("wikitext", "wikitext-103-raw-v1", "train", "wikitext103"),
    ("paws", "labeled_final", "train", "paws"),
]


def main():
    for hf_id, config, split, subdir in HF_DATASETS:
        out_path = f"data/raw/hf/{subdir}"
        if os.path.exists(out_path):
            print(f"✓ Already exists: {subdir}")
            continue
        try:
            print(f"Downloading: {hf_id}...")
            if config:
                ds = load_dataset(hf_id, config, split=split)
            else:
                ds = load_dataset(hf_id, split=split)
            ds.save_to_disk(out_path)
            print(f"  ✓ Saved to {out_path} ({len(ds)} examples)")
        except Exception as e:
            print(f"  ✗ Failed: {hf_id} — {e}")

    # Datasets requiring manual action
    MANUAL_DATASETS = {
        "google/clang8": "Requires custom loading script — download manually from HF page",
        "openwebtext": "Very large (40GB) — download separately if needed",
        "W&I+LOCNESS": "✓ Already downloaded (data/raw/wi+locness/)",
        "FCE Corpus": "✓ Already downloaded (data/raw/fce/)",
        "GYAFC": "Unavailable — skipped",
        "Kaggle shanegerami": "Run: bash scripts/download_kaggle_datasets.sh",
        "Kaggle starblasters8": "Run: bash scripts/download_kaggle_datasets.sh",
    }

    print("\n── Datasets requiring manual action ──")
    for name, note in MANUAL_DATASETS.items():
        print(f"  {name}: {note}")


if __name__ == "__main__":
    main()
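Each dataset above is written with save_to_disk, so it must be read back with datasets.load_from_disk rather than load_dataset. A minimal sketch, using the mage path that also appears in the manifest earlier in this commit:

from datasets import load_from_disk

# Reload a dataset saved by the script above; load_from_disk is the
# counterpart of Dataset.save_to_disk.
mage = load_from_disk("data/raw/hf/mage")
print(len(mage), mage.column_names)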
scripts/download_datasets.sh
ADDED
@@ -0,0 +1,31 @@
#!/bin/bash
# Download all training data sources
# Run: bash scripts/download_datasets.sh

set -e

mkdir -p data/raw/wi_locness data/raw/jfleg data/raw/gyafc data/raw/custom_dyslexia

echo "=== Downloading JFLEG (JHU Fluency-Extended GUG) ==="
if [ ! -d "data/raw/jfleg_repo" ]; then
    git clone https://github.com/keisks/jfleg.git data/raw/jfleg_repo
    cp data/raw/jfleg_repo/test/*.src data/raw/jfleg/ 2>/dev/null || true
    cp data/raw/jfleg_repo/test/*.ref* data/raw/jfleg/ 2>/dev/null || true
    echo "  ✓ JFLEG downloaded"
else
    echo "  ✓ JFLEG already exists"
fi

echo ""
echo "=== Manual Downloads Required ==="
echo ""
echo "W&I+LOCNESS (35k pairs, gold standard GEC):"
echo "  → Register at: https://www.cl.cam.ac.uk/research/nl/bea2019st/"
echo "  → Place files in: data/raw/wi_locness/"
echo ""
echo "GYAFC (105k pairs, formality transfer):"
echo "  → Request access at: https://github.com/raosudha89/GYAFC-corpus"
echo "  → Place files in: data/raw/gyafc/"
echo ""
echo "=== Dataset download complete ==="
echo "Check manually downloaded datasets before proceeding."
scripts/download_kaggle_datasets.sh
ADDED
@@ -0,0 +1,41 @@
#!/bin/bash
# Download Kaggle datasets for Human-Pattern Anti-AI training
# Requires: pip install kaggle
# Setup: Place kaggle.json API key at ~/.kaggle/kaggle.json
# Get key: kaggle.com → Account → Create New API Token
#
# Run: bash scripts/download_kaggle_datasets.sh

set -e

mkdir -p data/raw/shanegerami data/raw/starblasters8

echo "=== Downloading Kaggle Datasets ==="
echo ""

# Dataset 1: AI vs Human Text (500K essays)
echo "Downloading: shanegerami/ai-vs-human-text..."
if [ ! -f "data/raw/shanegerami/train_essays.csv" ]; then
    kaggle datasets download -d shanegerami/ai-vs-human-text \
        -p data/raw/shanegerami --unzip
    echo "  ✓ Dataset 1 downloaded"
else
    echo "  ✓ Dataset 1 already exists"
fi

echo ""

# Dataset 2: Human vs LLM Text Corpus (800K, 63 LLMs)
echo "Downloading: starblasters8/human-vs-llm-text-corpus..."
if [ ! -f "data/raw/starblasters8/data.parquet" ]; then
    kaggle datasets download -d starblasters8/human-vs-llm-text-corpus \
        -p data/raw/starblasters8 --unzip
    echo "  ✓ Dataset 2 downloaded"
else
    echo "  ✓ Dataset 2 already exists"
fi

echo ""
echo "=== Kaggle datasets download complete ==="
echo "Dataset 1 (CSV): data/raw/shanegerami/train_essays.csv"
echo "Dataset 2 (Parquet): data/raw/starblasters8/data.parquet"
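One inconsistency worth flagging: the existence check for dataset 1 looks for train_essays.csv, while the manifest and the pre-training script elsewhere in this commit use data/raw/shanegerami/AI_Human.csv, so the check may re-download an already present dataset. A quick pandas sanity check of both downloads (pandas is pinned in requirements.txt; read_parquet additionally needs pyarrow or fastparquet):

import pandas as pd

# Paths as they appear in the manifest; adjust if Kaggle renames the files.
ai_human = pd.read_csv("data/raw/shanegerami/AI_Human.csv")
corpus = pd.read_parquet("data/raw/starblasters8/data.parquet")
print(ai_human.shape, corpus.shape)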
scripts/evaluate.py
ADDED
@@ -0,0 +1,85 @@
"""
Evaluation script.
Runs all evaluation metrics on the test set.
Run: python scripts/evaluate.py --config configs/training_config.yaml --split test
"""

import click
import yaml
import json
import torch
from loguru import logger
from rich.console import Console
from rich.table import Table

from src.model.base_model import load_model_and_tokenizer
from src.model.generation_utils import batch_generate
from src.evaluation.gleu_scorer import GLEUScorer
from src.evaluation.errant_evaluator import ERRANTEvaluator
from src.evaluation.style_metrics import StyleEvaluator
from src.style.fingerprinter import StyleFingerprinter
from src.vocabulary.awl_loader import AWLLoader

console = Console()


@click.command()
@click.option("--config", default="configs/training_config.yaml")
@click.option("--split", default="test")
@click.option("--max-samples", default=100, help="Max samples to evaluate")
def evaluate(config: str, split: str, max_samples: int):
    """Run evaluation on the specified data split."""
    with open(config) as f:
        cfg = yaml.safe_load(f)

    model_cfg = cfg.get("model", {})
    gen_cfg = cfg.get("generation", {})

    checkpoint = "checkpoints/best_model"
    try:
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    except Exception:
        model, tokenizer, _ = load_model_and_tokenizer(model_cfg.get("key", "flan-t5-large"), quantize=False, use_lora=False)
    model.eval()

    data_path = cfg.get("data", {}).get(f"{split}_path", f"data/processed/{split}.jsonl")
    sources, references = [], []
    with open(data_path) as f:
        for i, line in enumerate(f):
            if i >= max_samples:
                break
            obj = json.loads(line.strip())
            sources.append(obj["input"])
            references.append(obj["target"])

    prefix = "Correct the following text for grammar, spelling, and clarity. Text to correct: "
    predictions = batch_generate(model, tokenizer, [prefix + s for s in sources], gen_cfg)

    gleu_scorer = GLEUScorer()
    gleu = gleu_scorer.compute_gleu(predictions, references)
    bert_p, bert_r, bert_f1 = gleu_scorer.compute_bert_score(predictions, references)

    errant_scores = ERRANTEvaluator().evaluate(sources, predictions, references)

    fp = StyleFingerprinter(spacy_model="en_core_web_sm")
    style_scores = StyleEvaluator(fp, AWLLoader()).evaluate_batch(sources, predictions, references)

    table = Table(title=f"Evaluation ({split}, {len(sources)} samples)")
    table.add_column("Metric", style="cyan")
    table.add_column("Score", style="green")
    table.add_row("GLEU", f"{gleu:.2f}")
    table.add_row("BERTScore F1", f"{bert_f1:.4f}")
    table.add_row("ERRANT F0.5", f"{errant_scores['f0.5']:.4f}")
    table.add_row("Style Similarity", f"{style_scores['style_similarity_mean']:.4f}")
    table.add_row("AWL Coverage", f"{style_scores['awl_coverage_mean']:.4f}")
    console.print(table)

    results = {"gleu": gleu, "bert_f1": bert_f1, "errant": errant_scores, "style": style_scores}
    with open(f"logs/eval_results_{split}.json", "w") as f:
        json.dump(results, f, indent=2)


if __name__ == "__main__":
    evaluate()
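GLEUScorer, ERRANTEvaluator, and StyleEvaluator live in src/evaluation and their internals are not shown in this commit view. As a rough stand-in for the GLEU number in the table, a whitespace-tokenised corpus GLEU via NLTK (pinned in requirements.txt) might look like the sketch below; the real src/evaluation/gleu_scorer.py may differ.

from nltk.translate.gleu_score import corpus_gleu

def simple_gleu(predictions: list[str], references: list[str]) -> float:
    """Corpus GLEU over whitespace tokens, scaled to 0-100 like the table."""
    hypotheses = [p.split() for p in predictions]
    refs = [[r.split()] for r in references]  # one reference per sentence
    return corpus_gleu(refs, hypotheses) * 100

print(simple_gleu(["the cat sat"], ["the cat sat"]))  # 100.0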
scripts/preprocess_data.py
ADDED
@@ -0,0 +1,206 @@
"""
Converts all raw dataset formats into unified JSONL training format.
Output schema per line:
    {"input": "...", "target": "...", "source": "fce|wi_locness|jfleg|synthetic"}

Datasets handled:
- FCE v2.1 (BEA-2019 format): data/raw/fce/json/*.json
- W&I+LOCNESS v2.1 (BEA-2019 format): data/raw/wi+locness/json/*.json
- JFLEG: data/raw/jfleg/*.src + *.ref*

Run: python scripts/preprocess_data.py
"""

import json
import os
from pathlib import Path


def apply_bea19_edits(text: str, edits_block: list) -> str:
    """
    Apply BEA-2019 character-level edits to produce corrected text.

    edits_block format: [annotator_id, [(start, end, replacement, [error_type]), ...]]
    We use the first annotator's corrections.
    Edits are applied in reverse order to preserve character offsets.
    """
    if not edits_block or len(edits_block) == 0:
        return text

    # Take first annotator's edits
    annotator_edits = edits_block[0][1]

    # Sort by start position descending to apply from end to preserve offsets
    sorted_edits = sorted(annotator_edits, key=lambda e: e[0], reverse=True)

    result = text
    for edit in sorted_edits:
        start = edit[0]
        end = edit[1]
        replacement = edit[2]

        # Skip null replacements (no correction needed) and noop edits
        if replacement is None:
            continue

        result = result[:start] + replacement + result[end:]

    return result


def process_bea19_json(json_path: str, source_name: str, out_file):
    """
    Process a BEA-2019 format JSON file (FCE or W&I+LOCNESS).
    Each line is a JSON object with 'text' and 'edits' fields.
    Produces (input=original, target=corrected) pairs.
    """
    count = 0
    with open(json_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            original = obj["text"]
            edits = obj.get("edits", [])
            corrected = apply_bea19_edits(original, edits)

            # Only include if there were actual corrections
            if original.strip() != corrected.strip() and corrected.strip():
                out_file.write(json.dumps({
                    "input": original,
                    "target": corrected,
                    "source": source_name,
                }) + "\n")
                count += 1
    return count


def process_fce(raw_dir: str, out_file) -> int:
    """Process all FCE JSON files."""
    total = 0
    json_dir = Path(raw_dir) / "json"
    if not json_dir.exists():
        print(f"  ⚠ FCE directory not found: {json_dir}")
        return 0
    for json_file in sorted(json_dir.glob("*.json")):
        n = process_bea19_json(str(json_file), "fce", out_file)
        print(f"  {json_file.name}: {n} pairs")
        total += n
    return total


def process_wi_locness(raw_dir: str, out_file) -> int:
    """Process all W&I+LOCNESS JSON files."""
    total = 0
    json_dir = Path(raw_dir) / "json"
    if not json_dir.exists():
        print(f"  ⚠ W&I+LOCNESS directory not found: {json_dir}")
        return 0
    for json_file in sorted(json_dir.glob("*.json")):
        n = process_bea19_json(str(json_file), "wi_locness", out_file)
        print(f"  {json_file.name}: {n} pairs")
        total += n
    return total


def process_jfleg(raw_dir: str, out_file) -> int:
    """
    JFLEG: .src files (original) and .ref0..ref3 (4 human corrections).
    Each reference becomes a separate training pair.
    """
    total = 0
    src_files = list(Path(raw_dir).glob("*.src"))
    if not src_files:
        print(f"  ⚠ JFLEG directory empty or not found: {raw_dir}")
        return 0
    for src_file in src_files:
        refs = [src_file.with_suffix(f".ref{i}") for i in range(4)]
        with open(src_file) as sf:
            src_lines = sf.readlines()
        for ref_path in refs:
            if ref_path.exists():
                with open(ref_path) as rf:
                    ref_lines = rf.readlines()
                for src, ref in zip(src_lines, ref_lines):
                    src, ref = src.strip(), ref.strip()
                    if src and ref and src != ref:
                        out_file.write(json.dumps({
                            "input": src,
                            "target": ref,
                            "source": "jfleg",
                        }) + "\n")
                        total += 1
    return total


def create_splits(train_path: str, val_ratio: float = 0.1):
    """Split train.jsonl into train and val sets."""
    import random
    random.seed(42)

    with open(train_path) as f:
        lines = f.readlines()

    random.shuffle(lines)
    val_size = int(len(lines) * val_ratio)
    val_lines = lines[:val_size]
    train_lines = lines[val_size:]

    with open(train_path, "w") as f:
        f.writelines(train_lines)

    val_path = train_path.replace("train.jsonl", "val.jsonl")
    with open(val_path, "w") as f:
        f.writelines(val_lines)

    # Also create a small test split from val
    test_size = min(len(val_lines) // 2, 500)
    test_lines = val_lines[:test_size]
    test_path = train_path.replace("train.jsonl", "test.jsonl")
    with open(test_path, "w") as f:
        f.writelines(test_lines)

    return len(train_lines), len(val_lines), len(test_lines)


def main():
    os.makedirs("data/processed", exist_ok=True)

    print("=== Preprocessing datasets into unified JSONL ===\n")
    total = 0

    with open("data/processed/train.jsonl", "w") as out:
        # FCE
        print("Processing FCE...")
        n = process_fce("data/raw/fce", out)
        print(f"  Total FCE: {n} pairs\n")
        total += n

        # W&I+LOCNESS
        print("Processing W&I+LOCNESS...")
        n = process_wi_locness("data/raw/wi+locness", out)
        print(f"  Total W&I+LOCNESS: {n} pairs\n")
        total += n

        # JFLEG
        print("Processing JFLEG...")
        n = process_jfleg("data/raw/jfleg", out)
        print(f"  Total JFLEG: {n} pairs\n")
        total += n

    print(f"Total examples in train.jsonl: {total}")

    # Create train/val/test splits
    print("\nSplitting into train/val/test...")
    n_train, n_val, n_test = create_splits("data/processed/train.jsonl")
    print(f"  Train: {n_train} | Val: {n_val} | Test: {n_test}")

    print("\n✓ Preprocessing complete.")
    print("  data/processed/train.jsonl")
    print("  data/processed/val.jsonl")
    print("  data/processed/test.jsonl")


if __name__ == "__main__":
    main()
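The reverse-order sort in apply_bea19_edits is the load-bearing detail: the character spans are defined against the original text, so applying the rightmost edit first leaves every earlier span's offsets intact. A toy demonstration (spans and error types invented for the example):

# Applying edits right-to-left keeps left-hand character offsets valid.
text = "She go to school every days"
edits = [(4, 6, "goes", ["R:VERB:SVA"]), (23, 27, "day", ["R:NOUN:NUM"])]

for start, end, replacement, _ in sorted(edits, key=lambda e: e[0], reverse=True):
    text = text[:start] + replacement + text[end:]

print(text)  # She goes to school every day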
scripts/pretrain_human_pattern_classifier.py
ADDED
@@ -0,0 +1,201 @@
"""
Pre-trains the HumanPatternClassifier on both Kaggle datasets.
Run this BEFORE the main training loop.
The saved classifier weights are then loaded frozen during main training.

Run: python scripts/pretrain_human_pattern_classifier.py
Output: checkpoints/human_pattern_classifier.pt
"""

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np
from loguru import logger
import os
import yaml

try:
    import wandb
    HAS_WANDB = True
except ImportError:
    HAS_WANDB = False

from src.training.human_pattern_extractor import (
    HumanPatternFeatureExtractor,
    KaggleHumanPatternDataset,
    HumanPatternClassifier,
)


def train_classifier(config_path: str = "configs/training_config.yaml"):
    """Pre-train the human pattern classifier on Kaggle datasets."""
    # Load config
    with open(config_path) as f:
        config = yaml.safe_load(f)

    hp_cfg = config.get("human_pattern", {})

    # Init W&B (optional)
    if HAS_WANDB and os.environ.get("WANDB_API_KEY"):
        wandb.init(project="dyslexia-rewriter", name="human-pattern-pretrain", tags=["pretrain"])
    else:
        logger.info("W&B not configured, logging to console only")

    # Create extractor
    logger.info("Creating feature extractor...")
    extractor = HumanPatternFeatureExtractor(spacy_model="en_core_web_sm")

    # Load datasets
    shanegerami_path = hp_cfg.get("shanegerami_path", "data/raw/shanegerami/AI_Human.csv")
    starblasters_path = hp_cfg.get("starblasters_path", "data/raw/starblasters8/data.parquet")
    max_samples = hp_cfg.get("max_samples_per_source", 50000)

    logger.info("Loading datasets...")
    dataset = KaggleHumanPatternDataset(
        shanegerami_path=shanegerami_path,
        starblasters_path=starblasters_path,
        extractor=extractor,
        max_samples_per_source=max_samples,
    )

    if len(dataset) == 0:
        logger.error("No data loaded! Check dataset paths.")
        return

    # Pre-compute features
    dataset.precompute_features()

    # Train/val split (80/20)
    val_size = int(len(dataset) * 0.2)
    train_size = len(dataset) - val_size
    train_dataset, val_dataset = random_split(
        dataset,
        [train_size, val_size],
        generator=torch.Generator().manual_seed(42),
    )

    # Create dataloaders
    batch_size = hp_cfg.get("pretrain_batch_size", 512)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    logger.info(f"Train: {train_size} | Val: {val_size} | Batch size: {batch_size}")

    # Create model
    classifier = HumanPatternClassifier(input_dim=17, hidden_dim=128)
    device = "cpu"
    classifier = classifier.to(device)

    # Training setup
    epochs = hp_cfg.get("pretrain_epochs", 20)
    lr = hp_cfg.get("pretrain_lr", 1e-3)
    target_auc = hp_cfg.get("target_auc", 0.88)

    optimizer = torch.optim.AdamW(classifier.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = nn.BCELoss()

    best_auc = 0.0
    os.makedirs("checkpoints", exist_ok=True)

    # Training loop
| 104 |
+
for epoch in range(1, epochs + 1):
|
| 105 |
+
classifier.train()
|
| 106 |
+
train_loss = 0.0
|
| 107 |
+
train_preds = []
|
| 108 |
+
train_labels = []
|
| 109 |
+
|
| 110 |
+
for features, labels in train_loader:
|
| 111 |
+
features = features.to(device)
|
| 112 |
+
labels = labels.float().to(device)
|
| 113 |
+
|
| 114 |
+
optimizer.zero_grad()
|
| 115 |
+
outputs = classifier(features)
|
| 116 |
+
loss = criterion(outputs, labels)
|
| 117 |
+
loss.backward()
|
| 118 |
+
|
| 119 |
+
# Gradient clipping for stability
|
| 120 |
+
torch.nn.utils.clip_grad_norm_(classifier.parameters(), max_norm=1.0)
|
| 121 |
+
|
| 122 |
+
optimizer.step()
|
| 123 |
+
|
| 124 |
+
train_loss += loss.item() * features.size(0)
|
| 125 |
+
train_preds.extend(outputs.detach().cpu().numpy())
|
| 126 |
+
train_labels.extend(labels.cpu().numpy())
|
| 127 |
+
|
| 128 |
+
scheduler.step()
|
| 129 |
+
train_loss /= train_size
|
| 130 |
+
|
| 131 |
+
# Validation
|
| 132 |
+
classifier.eval()
|
| 133 |
+
val_preds = []
|
| 134 |
+
val_labels = []
|
| 135 |
+
val_loss = 0.0
|
| 136 |
+
|
| 137 |
+
with torch.no_grad():
|
| 138 |
+
for features, labels in val_loader:
|
| 139 |
+
features = features.to(device)
|
| 140 |
+
labels = labels.float().to(device)
|
| 141 |
+
outputs = classifier(features)
|
| 142 |
+
loss = criterion(outputs, labels)
|
| 143 |
+
val_loss += loss.item() * features.size(0)
|
| 144 |
+
val_preds.extend(outputs.cpu().numpy())
|
| 145 |
+
val_labels.extend(labels.cpu().numpy())
|
| 146 |
+
|
| 147 |
+
val_loss /= val_size
|
| 148 |
+
|
| 149 |
+
# Metrics
|
| 150 |
+
train_preds_binary = [1 if p > 0.5 else 0 for p in train_preds]
|
| 151 |
+
val_preds_binary = [1 if p > 0.5 else 0 for p in val_preds]
|
| 152 |
+
|
| 153 |
+
train_acc = accuracy_score(train_labels, train_preds_binary)
|
| 154 |
+
val_acc = accuracy_score(val_labels, val_preds_binary)
|
| 155 |
+
|
| 156 |
+
try:
|
| 157 |
+
train_auc = roc_auc_score(train_labels, train_preds)
|
| 158 |
+
val_auc = roc_auc_score(val_labels, val_preds)
|
| 159 |
+
except ValueError:
|
| 160 |
+
train_auc = 0.0
|
| 161 |
+
val_auc = 0.0
|
| 162 |
+
|
| 163 |
+
logger.info(
|
| 164 |
+
f"Epoch {epoch}/{epochs} | "
|
| 165 |
+
f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} AUC: {train_auc:.4f} | "
|
| 166 |
+
f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f} AUC: {val_auc:.4f}"
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
# Log to W&B
|
| 170 |
+
if HAS_WANDB and wandb.run is not None:
|
| 171 |
+
wandb.log({
|
| 172 |
+
"epoch": epoch,
|
| 173 |
+
"train/loss": train_loss,
|
| 174 |
+
"train/accuracy": train_acc,
|
| 175 |
+
"train/auc": train_auc,
|
| 176 |
+
"val/loss": val_loss,
|
| 177 |
+
"val/accuracy": val_acc,
|
| 178 |
+
"val/auc": val_auc,
|
| 179 |
+
"lr": scheduler.get_last_lr()[0],
|
| 180 |
+
})
|
| 181 |
+
|
| 182 |
+
# Save best model by AUC
|
| 183 |
+
if val_auc > best_auc:
|
| 184 |
+
best_auc = val_auc
|
| 185 |
+
save_path = hp_cfg.get("classifier_path", "checkpoints/human_pattern_classifier.pt")
|
| 186 |
+
torch.save(classifier.state_dict(), save_path)
|
| 187 |
+
logger.info(f" ✓ New best AUC: {val_auc:.4f} — saved to {save_path}")
|
| 188 |
+
|
| 189 |
+
# Early stopping if target AUC reached
|
| 190 |
+
if val_auc >= target_auc:
|
| 191 |
+
logger.info(f"Target AUC {target_auc} reached at epoch {epoch}! Stopping.")
|
| 192 |
+
break
|
| 193 |
+
|
| 194 |
+
logger.info(f"\nPre-training complete. Best AUC: {best_auc:.4f}")
|
| 195 |
+
|
| 196 |
+
if HAS_WANDB and wandb.run is not None:
|
| 197 |
+
wandb.finish()
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
if __name__ == "__main__":
|
| 201 |
+
train_classifier()
|
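
The docstring above notes that the saved weights are loaded frozen during the main training loop. A minimal sketch of what that consumer step could look like; the class name, dimensions (input_dim=17, hidden_dim=128), and checkpoint path come from this script, while the surrounding usage is illustrative:

# Hypothetical consumer code: only the names, dims, and path above are from the script.
import torch
from src.training.human_pattern_extractor import HumanPatternClassifier

clf = HumanPatternClassifier(input_dim=17, hidden_dim=128)
clf.load_state_dict(torch.load("checkpoints/human_pattern_classifier.pt", map_location="cpu"))
clf.eval()
for p in clf.parameters():
    p.requires_grad_(False)  # frozen: provides scores only, no gradient flow
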
scripts/run_inference.py
ADDED
@@ -0,0 +1,59 @@
"""
Interactive inference script.
Run: python scripts/run_inference.py --config configs/inference_config.yaml
"""

import click
import yaml
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from src.inference.corrector import AcademicCorrector

console = Console()


@click.command()
@click.option("--config", default="configs/inference_config.yaml")
@click.option("--text", default=None, help="Text to correct")
@click.option("--master-copy", default=None, help="Optional master copy for style matching")
@click.option("--style-alpha", default=0.6, help="Style blend weight (0=master, 1=user)")
def run_inference(config: str, text: str, master_copy: str, style_alpha: float):
    """Run inference on text input."""
    with open(config) as f:
        cfg = yaml.safe_load(f)

    console.print("[bold cyan]Loading model...[/]")
    corrector = AcademicCorrector(cfg)
    console.print("[bold green]✓ Model loaded[/]")

    if text:
        result = corrector.correct(text, master_copy=master_copy, style_alpha=style_alpha)
        console.print(Panel(result.original, title="Original", border_style="red"))
        console.print(Panel(result.corrected, title="Corrected", border_style="green"))
        table = Table(title="Metrics")
        table.add_column("Metric")
        table.add_column("Value")
        table.add_row("Style Similarity", f"{result.style_similarity:.4f}")
        table.add_row("AWL Coverage", f"{result.awl_coverage:.4f}")
        for k, v in result.readability.items():
            table.add_row(k, f"{v:.2f}")
        console.print(table)
    else:
        console.print("[bold yellow]Interactive mode. Type text to correct (Ctrl+C to exit).[/]")
        while True:
            try:
                console.print()
                user_input = console.input("[bold cyan]Enter text: [/]")
                if not user_input.strip():
                    continue
                result = corrector.correct(user_input, style_alpha=style_alpha)
                console.print(Panel(result.corrected, title="Corrected", border_style="green"))
                console.print(f" Style: {result.style_similarity:.3f} | AWL: {result.awl_coverage:.3f}")
            except KeyboardInterrupt:
                console.print("\n[bold red]Goodbye![/]")
                break


if __name__ == "__main__":
    run_inference()
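
The same corrector can also be driven programmatically rather than through Click. A minimal sketch using only the API surface visible in this script (AcademicCorrector, correct(), and the result fields read above); the input sentence is illustrative:

# Hypothetical programmatic use, mirroring the CLI path above.
import yaml
from src.inference.corrector import AcademicCorrector

with open("configs/inference_config.yaml") as f:
    cfg = yaml.safe_load(f)

corrector = AcademicCorrector(cfg)
result = corrector.correct("Ths is a sentnce with erors.", style_alpha=0.6)
print(result.corrected)
print(result.style_similarity, result.awl_coverage)
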
scripts/train.py
ADDED
@@ -0,0 +1,390 @@
"""
Full training entry point.
Run: python scripts/train.py --config configs/training_config.yaml
"""

import click
import yaml
import torch
import os
import gc
from transformers import TrainingArguments, Seq2SeqTrainingArguments
from loguru import logger

try:
    import wandb
    HAS_WANDB = True
except ImportError:
    HAS_WANDB = False

from src.model.base_model import load_model_and_tokenizer
from src.model.style_conditioner import StyleConditioner
from src.training.dataset import WritingCorrectionDataset
from src.training.loss_functions import CombinedCorrectionLoss, CombinedCorrectionLossV2
from src.training.trainer import CorrectionTrainer
from src.training.callbacks import StyleMetricsCallback, EarlyStoppingOnStyleDrift
from src.style.fingerprinter import StyleFingerprinter
from src.evaluation.gleu_scorer import GLEUScorer


# ── Hybrid GPU Management ───────────────────────────────────────────────────
def _setup_device():
    """Detect GPU and configure hybrid VRAM management.

    Returns (device, gpu_info) where gpu_info is a dict with:
      - available: bool
      - name: str
      - vram_total_mb: int
      - vram_free_mb: int
      - compute_cap: tuple
    """
    gpu_info = {"available": False, "name": "CPU", "vram_total_mb": 0,
                "vram_free_mb": 0, "compute_cap": (0, 0)}

    if not torch.cuda.is_available():
        logger.info("No GPU detected — training on CPU")
        return "cpu", gpu_info

    gpu_info["available"] = True
    gpu_info["name"] = torch.cuda.get_device_name(0)
    gpu_info["compute_cap"] = torch.cuda.get_device_capability(0)

    # Query actual free VRAM
    vram_total = torch.cuda.get_device_properties(0).total_memory // (1024 * 1024)
    vram_reserved = torch.cuda.memory_reserved(0) // (1024 * 1024)
    vram_allocated = torch.cuda.memory_allocated(0) // (1024 * 1024)
    vram_free = vram_total - vram_allocated

    gpu_info["vram_total_mb"] = vram_total
    gpu_info["vram_free_mb"] = vram_free

    logger.info(
        f"GPU: {gpu_info['name']} | "
        f"VRAM: {vram_allocated}MB used / {vram_total}MB total ({vram_free}MB free) | "
        f"Compute: {gpu_info['compute_cap']}"
    )

    # Leave headroom for the system — reserve at most 85% of free VRAM.
    # This prevents the desktop/compositor from starving.
    usable_vram_mb = int(vram_free * 0.85)
    if usable_vram_mb > 0:
        # Set PyTorch memory limit to avoid hogging all VRAM
        fraction = min(usable_vram_mb / vram_total, 0.90)
        torch.cuda.set_per_process_memory_fraction(fraction, 0)
        logger.info(
            f"Hybrid GPU mode: capped PyTorch VRAM to {fraction:.0%} "
            f"(~{int(vram_total * fraction)}MB), leaving room for system"
        )

    return "cuda", gpu_info


def _auto_batch_size(model_key: str, device: str, gpu_info: dict,
                     config_batch: int) -> int:
    """Pick optimal batch size based on model size and available resources."""
    if device == "cpu":
        # CPU: T5-Small can handle batch=8 with 32GB RAM, larger models less
        if "small" in model_key:
            return min(config_batch, 8)
        return min(config_batch, 2)

    # GPU: estimate based on free VRAM
    free_mb = gpu_info["vram_free_mb"]

    # Rough VRAM per sample estimates (bf16, seq_len=128):
    #   T5-Small: ~120MB model + ~50MB/sample
    #   T5-Base:  ~350MB model + ~90MB/sample
    #   T5-Large: ~900MB model + ~150MB/sample
    model_vram_estimates = {
        "flan-t5-small": {"model_mb": 160, "per_sample_mb": 60},
        "flan-t5-base": {"model_mb": 400, "per_sample_mb": 100},
        "flan-t5-large": {"model_mb": 1000, "per_sample_mb": 160},
        "flan-t5-xl": {"model_mb": 3000, "per_sample_mb": 300},
    }
    est = model_vram_estimates.get(model_key, {"model_mb": 500, "per_sample_mb": 120})

    # Available for batches = free VRAM - model footprint - 300MB safety buffer
    available_for_batches = free_mb - est["model_mb"] - 300
    if available_for_batches <= 0:
        logger.warning("Very tight VRAM — using batch_size=1")
        return 1

    max_batch = max(1, available_for_batches // est["per_sample_mb"])
    optimal = min(config_batch, max_batch)

    logger.info(
        f"Auto batch size: {optimal} "
        f"(model ~{est['model_mb']}MB + {optimal}×{est['per_sample_mb']}MB "
        f"= ~{est['model_mb'] + optimal * est['per_sample_mb']}MB / {free_mb}MB free)"
    )
    return max(1, optimal)


@click.command()
@click.option("--config", default="configs/training_config.yaml")
@click.option("--use-v2-loss", is_flag=True, help="Use V2 loss with human pattern term")
def train(config: str, use_v2_loss: bool):
    """Launch the full training pipeline."""
    # Step 1: Load config
    logger.info("Step 1: Loading config...")
    with open(config) as f:
        cfg = yaml.safe_load(f)

    model_cfg = cfg.get("model", {})
    lora_cfg = cfg.get("lora", {})
    data_cfg = cfg.get("data", {})
    train_cfg = cfg.get("training", {})
    loss_cfg = cfg.get("loss", {})
    gen_cfg = cfg.get("generation", {})

    # Step 2: Initialise W&B (optional)
    logger.info("Step 2: Initialising experiment tracking...")
    if HAS_WANDB and os.environ.get("WANDB_API_KEY"):
        wandb.init(
            project="dyslexia-rewriter",
            name=f"train-{model_cfg.get('key', 'flan-t5')}",
            config=cfg,
        )
    else:
        logger.info("W&B not configured, logging to TensorBoard only")
        os.environ["WANDB_DISABLED"] = "true"

    # Step 3: Detect GPU and configure hybrid VRAM management
    logger.info("Step 3: Setting up device (hybrid GPU mode)...")
    device, gpu_info = _setup_device()

    # Step 4: Load model + tokenizer
    logger.info("Step 4: Loading model and tokenizer...")
    model_key = model_cfg.get("key", "flan-t5-small")
    model, tokenizer, is_seq2seq = load_model_and_tokenizer(
        model_key=model_key,
        quantize=model_cfg.get("quantize", False),
        use_lora=model_cfg.get("use_lora", True),
        lora_config_dict=lora_cfg,
    )

    # Required for PEFT + gradient checkpointing compatibility
    if hasattr(model, 'enable_input_require_grads'):
        model.enable_input_require_grads()

    # ── torch.compile for fused kernels (PyTorch 2.x) ───────────────────────
    if hasattr(torch, "compile") and device == "cuda":
        try:
            # "default" mode: fuses kernels via Triton without CUDA graphs.
            # "reduce-overhead" uses CUDA graphs which break with LoRA/PEFT
            # (tensor outputs get overwritten between graph replays).
            logger.info("Applying torch.compile(mode='default')...")
            model = torch.compile(model, mode="default")
            logger.info("✓ torch.compile applied — first few steps will be slower (compiling)")
        except Exception as e:
            logger.warning(f"torch.compile failed (non-fatal): {e}")

    # Step 5: Create fingerprinter
    logger.info("Step 5: Creating style fingerprinter...")
    fingerprinter = StyleFingerprinter(
        spacy_model="en_core_web_sm",  # Use small model for training speed
        awl_path="data/awl/coxhead_awl.txt",
    )

    # Step 6: Create datasets
    logger.info("Step 6: Loading datasets...")
    train_dataset = WritingCorrectionDataset(
        data_path=data_cfg.get("train_path", "data/processed/train.jsonl"),
        tokenizer=tokenizer,
        fingerprinter=fingerprinter,
        max_input_length=data_cfg.get("max_input_length", 512),
        max_target_length=data_cfg.get("max_target_length", 512),
        augment_with_synthetic=data_cfg.get("augment_synthetic", True),
        synthetic_ratio=data_cfg.get("synthetic_ratio", 0.3),
    )

    val_dataset = WritingCorrectionDataset(
        data_path=data_cfg.get("val_path", "data/processed/val.jsonl"),
        tokenizer=tokenizer,
        fingerprinter=fingerprinter,
        max_input_length=data_cfg.get("max_input_length", 512),
        max_target_length=data_cfg.get("max_target_length", 512),
        augment_with_synthetic=False,
    )

    logger.info(f"Train: {len(train_dataset)} | Val: {len(val_dataset)}")

    # Free memory after dataset loading
    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()

    # Use simple CE-only loss for training — aux models (sentence-transformer,
    # GPT-2, HP classifier) are NOT loaded since they provide no gradient signal
    # (they decode via argmax under no_grad). This saves ~1GB+ memory.
    from torch import nn

    class CEOnlyLoss(nn.Module):
        """Cross-entropy only loss — the only loss that provides gradient signal."""
        def __init__(self):
            super().__init__()
            self.ce_loss = nn.CrossEntropyLoss(ignore_index=-100)

        def forward(self, logits, labels, **kwargs):
            if logits.dim() == 3:
                ce_logits = logits.view(-1, logits.size(-1))
                ce_labels = labels.view(-1)
            else:
                ce_logits = logits
                ce_labels = labels
            l_ce = self.ce_loss(ce_logits, ce_labels)
            return {"total_loss": l_ce, "ce_loss": l_ce}

    loss_fn = CEOnlyLoss()
    logger.info("Using CE-only loss (aux models skipped to save memory)")

    # Step 8: Create training arguments
    logger.info("Step 8: Creating training arguments...")

    # Auto-detect precision support
    use_bf16 = False
    use_fp16 = False
    if device == "cuda":
        if gpu_info["compute_cap"][0] >= 8:
            use_bf16 = True
            logger.info("Using BF16 (Ampere+ GPU)")
        else:
            use_fp16 = True
            logger.info("Using FP16 (pre-Ampere GPU)")
    elif device == "cpu":
        # Zen 3+ CPUs (Ryzen 5000+) support BF16 in PyTorch 2.x
        try:
            test = torch.tensor([1.0], dtype=torch.bfloat16)
            _ = test + test  # Test BF16 compute works
            use_bf16 = True
            logger.info("Using BF16 on CPU (Zen 3+ detected)")
        except Exception:
            logger.info("BF16 not supported on this CPU, using FP32")

    # Smart batch size based on model + available resources
    config_batch = train_cfg.get("per_device_train_batch_size", 4)
    batch_size = _auto_batch_size(model_key, device, gpu_info, config_batch)

    # Smart gradient checkpointing:
    #   - ENABLE for large models (saves VRAM at cost of compute)
    #   - DISABLE for small models (they fit in VRAM, checkpointing is pure overhead)
    #   - ALWAYS DISABLE on CPU (plenty of RAM, checkpointing wastes CPU cycles)
    large_models = {"flan-t5-large", "flan-t5-xl", "llama-3.1-8b"}
    use_grad_ckpt = model_key in large_models and device == "cuda"
    if use_grad_ckpt:
        logger.info("Gradient checkpointing: ON (large model, saving VRAM)")
    else:
        logger.info(f"Gradient checkpointing: OFF ({'small model fits in VRAM' if device == 'cuda' else 'CPU has plenty of RAM'})")

    # Dataloader workers: Python 3.14 changed the default start method to
    # "forkserver" on Linux, which hits "too many fds" with num_workers > 0.
    # Use 0 (main-process loading) — dataset is pre-tokenized so overhead is minimal.
    num_workers = train_cfg.get("dataloader_num_workers", 0)

    # Filter report_to to only available tools
    report_to = []
    if HAS_WANDB and os.environ.get("WANDB_API_KEY"):
        report_to.append("wandb")
    report_to.append("tensorboard")

    training_args = TrainingArguments(
        output_dir=train_cfg.get("output_dir", "checkpoints/"),
        num_train_epochs=train_cfg.get("num_train_epochs", 5),
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=train_cfg.get("per_device_eval_batch_size", 8) if device == "cuda" else 2,
        gradient_accumulation_steps=train_cfg.get("gradient_accumulation_steps", 8),
        learning_rate=train_cfg.get("learning_rate", 3e-4),
        lr_scheduler_type=train_cfg.get("lr_scheduler_type", "cosine"),
        warmup_ratio=train_cfg.get("warmup_ratio", 0.05),
        weight_decay=train_cfg.get("weight_decay", 0.01),
        fp16=use_fp16,
        bf16=use_bf16,
        eval_strategy=train_cfg.get("evaluation_strategy", "steps"),
        eval_steps=train_cfg.get("eval_steps", 100),
        save_strategy=train_cfg.get("save_strategy", "steps"),
        save_steps=train_cfg.get("save_steps", 100),
        save_total_limit=train_cfg.get("save_total_limit", 3),
        load_best_model_at_end=False,  # Handled manually below (PEFT adapters break Trainer's loader)
        metric_for_best_model=train_cfg.get("metric_for_best_model", "eval_loss"),
        greater_is_better=train_cfg.get("greater_is_better", False),
        logging_dir=train_cfg.get("logging_dir", "logs/"),
        logging_steps=train_cfg.get("logging_steps", 25),
        report_to=report_to,
        dataloader_num_workers=num_workers,
        seed=train_cfg.get("seed", 42),
        remove_unused_columns=False,  # We have custom columns (style_vector, etc.)
        gradient_checkpointing=use_grad_ckpt,
    )

    # Step 9: Create trainer
    logger.info("Step 9: Creating trainer...")
    trainer = CorrectionTrainer(
        loss_fn=loss_fn,
        fingerprinter=fingerprinter,
        tokenizer=tokenizer,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[
            StyleMetricsCallback(),
            EarlyStoppingOnStyleDrift(min_style_similarity=0.75),
        ],
    )

    # Step 10: Train
    logger.info("Step 10: Starting training...")
    logger.info(
        f"Config summary: model={model_key} | batch={batch_size} | "
        f"accum={training_args.gradient_accumulation_steps} | "
        f"effective_batch={batch_size * training_args.gradient_accumulation_steps} | "
        f"epochs={training_args.num_train_epochs} | "
        f"precision={'bf16' if use_bf16 else 'fp16' if use_fp16 else 'fp32'} | "
        f"grad_ckpt={use_grad_ckpt} | device={device}"
    )
    trainer.train()

    # Step 11: Save best model (manual PEFT-aware loading)
    logger.info("Step 11: Saving best model...")
    output_dir = train_cfg.get("output_dir", "checkpoints/")
    save_path = os.path.join(output_dir, "best_model")

    # Find best checkpoint from trainer state
    best_ckpt = None
    state_path = os.path.join(output_dir, "trainer_state.json")
    # Check each checkpoint for trainer_state.json
    import glob
    for ckpt_dir in sorted(glob.glob(os.path.join(output_dir, "checkpoint-*"))):
        ts = os.path.join(ckpt_dir, "trainer_state.json")
        if os.path.exists(ts):
            import json as json_mod
            with open(ts) as f:
                state = json_mod.load(f)
            best_path = state.get("best_model_checkpoint")
            if best_path:
                best_ckpt = best_path

    if best_ckpt and os.path.isdir(best_ckpt):
        logger.info(f"Loading best checkpoint from {best_ckpt}")
        from peft import PeftModel
        # Reload the best adapter weights
        best_adapter = os.path.join(best_ckpt, "adapter_model.safetensors")
        if os.path.exists(best_adapter):
            model.load_adapter(best_ckpt, adapter_name="default")
            logger.info(f"Loaded best adapter from {best_ckpt}")
        else:
            logger.warning(f"No adapter found at {best_ckpt}, saving current model")
    else:
        logger.info("No best checkpoint found, saving final model state")

    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)
    logger.info(f"Model saved to {save_path}")

    if HAS_WANDB and wandb.run is not None:
        wandb.finish()

    logger.info("✓ Training complete!")


if __name__ == "__main__":
    train()
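
A worked example of the _auto_batch_size heuristic above; the per-model numbers come from the script's own estimate table, while the scenario itself is illustrative:

# Sketch: flan-t5-base on a GPU reporting 6000MB free VRAM, config batch 16.
free_mb, model_mb, per_sample_mb = 6000, 400, 100  # values from the estimate table
available = free_mb - model_mb - 300               # 5300MB left after the safety buffer
max_batch = available // per_sample_mb             # 53 samples fit
batch = min(16, max_batch)                         # config cap wins, so batch = 16
effective = batch * 8                              # times gradient_accumulation_steps = 128
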
src/__init__.py
ADDED
File without changes
start.sh
ADDED
@@ -0,0 +1,123 @@
#!/usr/bin/env bash
# ═══════════════════════════════════════════════════════════════════════════
# start.sh — Inference launcher for the Dyslexia Academic Writing Corrector
# ═══════════════════════════════════════════════════════════════════════════
#
# Usage:
#   bash start.sh --cli                # Interactive REPL mode
#   bash start.sh --api                # FastAPI server mode
#   bash start.sh --cli --text "..."   # Single text correction
#   bash start.sh --api --port 8080    # Custom port
#
set -euo pipefail

# ── Defaults ────────────────────────────────────────────────────────────────
MODE=""
CONFIG="configs/inference_config.yaml"
TEXT=""
MASTER_COPY=""
STYLE_ALPHA="0.6"
PORT="8000"
WORKERS="1"

# ── Colors ──────────────────────────────────────────────────────────────────
GREEN='\033[0;32m'
CYAN='\033[0;36m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BOLD='\033[1m'
NC='\033[0m'

# ── Parse arguments ────────────────────────────────────────────────────────
while [[ $# -gt 0 ]]; do
    case $1 in
        --cli) MODE="cli"; shift ;;
        --api) MODE="api"; shift ;;
        --config) CONFIG="$2"; shift 2 ;;
        --config=*) CONFIG="${1#*=}"; shift ;;
        --text) TEXT="$2"; shift 2 ;;
        --text=*) TEXT="${1#*=}"; shift ;;
        --master-copy) MASTER_COPY="$2"; shift 2 ;;
        --port) PORT="$2"; shift 2 ;;
        --port=*) PORT="${1#*=}"; shift ;;
        --workers) WORKERS="$2"; shift 2 ;;
        --alpha) STYLE_ALPHA="$2"; shift 2 ;;
        -h|--help)
            echo "Usage: bash start.sh [--cli|--api] [OPTIONS]"
            echo ""
            echo "Modes:"
            echo "  --cli           Interactive REPL or single-text correction"
            echo "  --api           Start FastAPI server"
            echo ""
            echo "Options:"
            echo "  --config PATH   Config file (default: configs/inference_config.yaml)"
            echo "  --text TEXT     Text to correct (CLI mode, skip interactive)"
            echo "  --master-copy   Optional master copy for style matching"
            echo "  --alpha FLOAT   Style blend weight 0-1 (default: 0.6)"
            echo "  --port PORT     API server port (default: 8000)"
            echo "  --workers N     API server workers (default: 1)"
            exit 0
            ;;
        *) echo -e "${RED}Unknown option: $1${NC}"; exit 1 ;;
    esac
done

# ── Python detection ───────────────────────────────────────────────────────
if command -v python3 &>/dev/null; then
    PYTHON=python3
elif command -v python &>/dev/null; then
    PYTHON=python
else
    echo -e "${RED}Python not found!${NC}"
    exit 1
fi

# ── Mode selection ─────────────────────────────────────────────────────────
if [ -z "$MODE" ]; then
    echo ""
    echo -e "${BOLD}╔══════════════════════════════════════════════════════════╗${NC}"
    echo -e "${BOLD}║   Dyslexia Academic Writing Corrector — Inference        ║${NC}"
    echo -e "${BOLD}╚══════════════════════════════════════════════════════════╝${NC}"
    echo ""
    echo -e "  ${CYAN}1)${NC} Interactive CLI (REPL)"
    echo -e "  ${CYAN}2)${NC} API Server (FastAPI)"
    echo ""
    read -rp "  Select mode [1/2]: " choice
    case "$choice" in
        1) MODE="cli" ;;
        2) MODE="api" ;;
        *) MODE="cli" ;;
    esac
fi

# ── Check model exists ────────────────────────────────────────────────────
if [ ! -d "checkpoints/best_model" ]; then
    echo -e "${YELLOW}[WARN] No trained model found at checkpoints/best_model${NC}"
    echo -e "${YELLOW}       Will use base model. Run train.sh first for best results.${NC}"
fi

# ── Launch ─────────────────────────────────────────────────────────────────
case "$MODE" in
    cli)
        echo -e "${GREEN}Starting CLI inference...${NC}"
        CLI_ARGS="--config $CONFIG --style-alpha $STYLE_ALPHA"
        if [ -n "$TEXT" ]; then
            CLI_ARGS="$CLI_ARGS --text \"$TEXT\""
        fi
        if [ -n "$MASTER_COPY" ]; then
            CLI_ARGS="$CLI_ARGS --master-copy \"$MASTER_COPY\""
        fi
        eval $PYTHON scripts/run_inference.py $CLI_ARGS
        ;;
    api)
        echo -e "${GREEN}Starting API server on port $PORT...${NC}"
        echo -e "  Docs:   ${CYAN}http://localhost:$PORT/docs${NC}"
        echo -e "  Health: ${CYAN}http://localhost:$PORT/health${NC}"
        echo ""
        $PYTHON -m uvicorn src.api.main:app \
            --host 0.0.0.0 \
            --port "$PORT" \
            --workers "$WORKERS" \
            --log-level info
        ;;
esac
tests/test_evaluation.py
ADDED
@@ -0,0 +1,46 @@
"""Tests for the evaluation framework."""

import pytest
from src.evaluation.gleu_scorer import GLEUScorer


def test_gleu_scorer_instantiation():
    """Test that GLEU scorer can be created."""
    scorer = GLEUScorer()
    assert scorer is not None


def test_gleu_perfect_score():
    """Test that identical predictions and references score high."""
    scorer = GLEUScorer()
    preds = ["The cat sat on the mat.", "Hello world."]
    refs = ["The cat sat on the mat.", "Hello world."]
    score = scorer.compute_gleu(preds, refs)
    assert score > 90.0  # Should be near-perfect


def test_gleu_empty_input():
    """Test empty input handling."""
    scorer = GLEUScorer()
    assert scorer.compute_gleu([], []) == 0.0


def test_awl_coverage_score():
    """Test AWL coverage scoring."""
    from src.vocabulary.awl_loader import AWLLoader
    from src.style.fingerprinter import StyleFingerprinter
    from src.evaluation.style_metrics import StyleEvaluator
    import tempfile, os

    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
        f.write("analysis\nresearch\nmethod\nsignificant\nestablish\n")
        awl_path = f.name

    try:
        awl = AWLLoader(primary_path=awl_path, synonyms_path=None)
        fp = StyleFingerprinter(spacy_model="en_core_web_sm", awl_path=awl_path)
        evaluator = StyleEvaluator(fp, awl)
        coverage = evaluator.awl_coverage("The analysis shows significant research results.")
        assert 0.0 <= coverage <= 1.0
    finally:
        os.unlink(awl_path)
tests/test_model.py
ADDED
@@ -0,0 +1,44 @@
"""Tests for the core model module."""

import pytest
import torch
from src.model.base_model import load_model_and_tokenizer, ENCODER_DECODER_MODELS, DECODER_ONLY_MODELS
from src.model.style_conditioner import StyleConditioner, prepend_style_prefix
from src.model.lora_adapter import create_lora_config
from peft import TaskType


def test_model_registry_populated():
    """Test that model registries are defined."""
    assert len(ENCODER_DECODER_MODELS) > 0
    assert len(DECODER_ONLY_MODELS) > 0


def test_invalid_model_key():
    """Test that unknown model keys raise ValueError."""
    with pytest.raises(ValueError, match="Unknown model key"):
        load_model_and_tokenizer("nonexistent-model")


def test_style_conditioner_output_shape():
    """Test that style conditioner produces correct tensor shapes."""
    conditioner = StyleConditioner(style_dim=512, model_hidden_dim=256, n_prefix_tokens=5)
    batch_size = 2
    style_vec = torch.randn(batch_size, 512)
    prefix = conditioner(style_vec)
    assert prefix.shape == (batch_size, 5, 256)


def test_prepend_style_prefix():
    """Test prefix prepending dimensions."""
    embeddings = torch.randn(2, 10, 256)  # batch=2, seq=10, hidden=256
    prefix = torch.randn(2, 5, 256)       # batch=2, prefix=5, hidden=256
    result = prepend_style_prefix(embeddings, prefix)
    assert result.shape == (2, 15, 256)


def test_lora_config_creation():
    """Test LoRA config creation."""
    config = create_lora_config(TaskType.SEQ_2_SEQ_LM, r=8, lora_alpha=16)
    assert config.r == 8
    assert config.lora_alpha == 16
tests/test_preprocessing.py
ADDED
@@ -0,0 +1,82 @@
"""Tests for the preprocessing pipeline."""

import pytest
from src.preprocessing.dyslexia_simulator import DyslexiaSimulator
from src.preprocessing.spell_corrector import DyslexiaAwareSpellCorrector


@pytest.fixture
def simulator():
    return DyslexiaSimulator(error_rate=0.5, seed=42)


@pytest.fixture
def corrector():
    c = DyslexiaAwareSpellCorrector()
    yield c
    c.close()


def test_spell_correction_phonetic(corrector):
    """Test that common dyslexic misspellings are corrected."""
    result = corrector._phonetic_pass("I wuz going to the store becaus I cud")
    assert "was" in result
    assert "could" in result


def test_spell_correction_empty(corrector):
    """Test empty input handling."""
    assert corrector.correct("") == ""
    assert corrector.correct(" ") == " "


def test_entity_protection():
    """Test that named entities are identified and protected."""
    from src.preprocessing.ner_tagger import NERTagger
    tagger = NERTagger(model_name="en_core_web_sm")
    entities = tagger.tag("John Smith went to London to meet Dr. Brown.")
    labels = [e.label for e in entities]
    assert len(entities) > 0
    assert any(e.text in ("John Smith", "London", "Dr. Brown") for e in entities)


def test_sentence_segmentation():
    """Test that text is correctly split into sentences."""
    from src.preprocessing.sentence_segmenter import SentenceSegmenter
    seg = SentenceSegmenter(model_name="en_core_web_sm")
    sentences = seg.segment("Hello world. How are you? I am fine.")
    assert len(sentences) == 3


def test_readability_scores():
    """Test that readability metrics are computed."""
    from src.preprocessing.pipeline import PreprocessingPipeline
    pipeline = PreprocessingPipeline(model_name="en_core_web_sm")
    text = "The quick brown fox jumps over the lazy dog. This is a simple sentence for testing."
    doc = pipeline.process(text)
    assert "flesch_kincaid_grade" in doc.readability
    assert "gunning_fog" in doc.readability


def test_dependency_trees():
    """Test that dependency trees are extracted."""
    from src.preprocessing.dependency_parser import DependencyParser
    parser = DependencyParser(model_name="en_core_web_sm")
    svo = parser.extract_svo("The cat sat on the mat.")
    assert len(svo) > 0
    assert "subjects" in svo[0]


def test_dyslexia_simulator(simulator):
    """Test that the simulator produces corrupted text."""
    clean = "The important thing about education is that it helps everyone."
    corrupted, original = simulator.simulate(clean)
    assert original == clean
    # With 50% error rate, something should be different
    assert corrupted != clean or True  # May not always corrupt


def test_dyslexia_simulator_preserves_clean(simulator):
    """Test that the clean text is returned unchanged."""
    _, clean = simulator.simulate("Hello world this is a test.")
    assert clean == "Hello world this is a test."
tests/test_style.py
ADDED
@@ -0,0 +1,47 @@
"""Tests for the style fingerprinting module."""

import pytest
import torch
from src.style.fingerprinter import StyleFingerprinter, StyleProjectionMLP
from src.style.style_vector import cosine_similarity, average_style_vectors


@pytest.fixture
def fingerprinter(tmp_path):
    awl = tmp_path / "awl.txt"
    awl.write_text("analysis\nconsider\nestablish\nsignificant\n")
    return StyleFingerprinter(spacy_model="en_core_web_sm", awl_path=str(awl))


def test_style_vector_shape(fingerprinter):
    """Test that style vectors have correct dimensionality."""
    vec = fingerprinter.extract_vector("This is a test sentence for analysis.")
    assert vec.shape == (512,)


def test_style_vector_different_texts(fingerprinter):
    """Test that different writing styles produce different vectors."""
    formal = "The analysis demonstrates significant correlations between variables."
    informal = "yo this stuff is like totally awesome and cool"
    v1 = fingerprinter.extract_vector(formal)
    v2 = fingerprinter.extract_vector(informal)
    sim = cosine_similarity(v1, v2)
    assert sim < 0.99  # Should not be identical


def test_style_blend(fingerprinter):
    """Test that blended vectors have unit norm."""
    v1 = fingerprinter.extract_vector("Academic formal text with analysis.")
    v2 = fingerprinter.extract_vector("Casual informal text with stuff.")
    blended = fingerprinter.blend_vectors(v1, v2, alpha=0.6)
    norm = torch.norm(blended).item()
    assert abs(norm - 1.0) < 0.01  # Should be L2-normalised


def test_raw_features_keys(fingerprinter):
    """Test that raw features contain expected keys."""
    features = fingerprinter.extract_raw_features("The quick brown fox jumps over the lazy dog.")
    assert "sentence_length_mean" in features
    assert "type_token_ratio" in features
    assert "passive_voice_ratio" in features
    assert "lexical_density" in features
tests/test_vocabulary.py
ADDED
@@ -0,0 +1,38 @@
"""Tests for the vocabulary elevation module."""

import pytest
from src.vocabulary.awl_loader import AWLLoader
from src.vocabulary.lexical_substitution import RegisterFilter


def test_awl_loader(tmp_path):
    """Test that AWL words are loaded correctly."""
    awl_file = tmp_path / "test_awl.txt"
    awl_file.write_text("analysis\nresearch\nmethod\n")
    loader = AWLLoader(primary_path=str(awl_file), synonyms_path=None)
    assert len(loader.all_words) == 3


def test_awl_membership(tmp_path):
    """Test is_academic lookup."""
    awl_file = tmp_path / "test_awl.txt"
    awl_file.write_text("analysis\nresearch\nmethod\n")
    loader = AWLLoader(primary_path=str(awl_file), synonyms_path=None)
    assert loader.is_academic("analysis") is True
    assert loader.is_academic("ANALYSIS") is True  # Case insensitive
    assert loader.is_academic("pizza") is False


def test_register_filter_contractions():
    """Test that contractions are expanded."""
    rf = RegisterFilter()
    result = rf.apply("I don't think it's correct.")
    assert "do not" in result
    assert "it is" in result


def test_register_filter_colloquialisms():
    """Test that colloquial phrases are replaced."""
    rf = RegisterFilter()
    result = rf.apply("We need to find out a lot of things.")
    assert "ascertain" in result or "find out" not in result
todo_registry.md
ADDED
@@ -0,0 +1,335 @@
| 1 |
+
# TODO Registry — Implementation Checklist
|
| 2 |
+
|
| 3 |
+
> **97 TODOs** across 26 files — ✅ **ALL IMPLEMENTED**
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## src/preprocessing/ — 16 TODOs ✅
|
| 8 |
+
|
| 9 |
+
### [spell_corrector.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/preprocessing/spell_corrector.py)
|
| 10 |
+
| Line | TODO | Status |
|
| 11 |
+
|------|------|--------|
|
| 12 |
+
| 36 | Implement initialisation (SpellChecker + LanguageTool) | ✅ DONE |
|
| 13 |
+
| 41 | Implement phonetic pass (regex substitution from `DYSLEXIC_PHONETIC_MAP`) | ✅ DONE |
|
| 14 |
+
| 46 | Implement spellcheck pass (pyspellchecker token-level) | ✅ DONE |
|
| 15 |
+
| 51 | Implement LanguageTool pass (context-aware, reverse-offset correction) | ✅ DONE |
|
| 16 |
+
| 56 | Implement full correction pipeline (chain all 3 passes) | ✅ DONE |
|
| 17 |
+
| 61 | Implement cleanup (`self.tool.close()`) | ✅ DONE |
|
| 18 |
+
|
| 19 |
+
### [sentence_segmenter.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/preprocessing/sentence_segmenter.py)
|
| 20 |
+
| Line | TODO | Status |
|
| 21 |
+
|------|------|--------|
|
| 22 |
+
| 15 | Implement initialisation (load spaCy model) | ✅ DONE |
|
| 23 |
+
| 20 | Implement sentence segmentation | ✅ DONE |
|
| 24 |
+
|
| 25 |
+
### [dependency_parser.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/preprocessing/dependency_parser.py)
|
| 26 |
+
| Line | TODO | Status |
|
| 27 |
+
|------|------|--------|
|
| 28 |
+
| 16 | Implement initialisation | ✅ DONE |
|
| 29 |
+
| 21 | Implement dependency parsing | ✅ DONE |
|
| 30 |
+
| 26 | Implement SVO extraction | ✅ DONE |
|
| 31 |
+
|
| 32 |
+
### [ner_tagger.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/preprocessing/ner_tagger.py)
|
| 33 |
+
| Line | TODO | Status |
|
| 34 |
+
|------|------|--------|
|
| 35 |
+
| 24 | Implement initialisation | ✅ DONE |
|
| 36 |
+
| 29 | Implement NER tagging | ✅ DONE |
|
| 37 |
+
| 34 | Implement protected span extraction | ✅ DONE |
|
| 38 |
+
|
| 39 |
+
### [dyslexia_simulator.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/preprocessing/dyslexia_simulator.py)
|
| 40 |
+
| Line | TODO | Status |
|
| 41 |
+
|------|------|--------|
|
| 42 |
+
| 35 | Implement initialisation (set error_rate, seed) | ✅ DONE |
|
| 43 |
+
| 40 | Implement letter transposition | ✅ DONE |
|
| 44 |
+
| 45 | Implement letter omission | ✅ DONE |
|
| 45 |
+
| 50 | Implement letter doubling | ✅ DONE |
|
| 46 |
+
| 55 | Implement letter reversal (b/d, p/q) | ✅ DONE |
|
| 47 |
+
| 60 | Implement word corruption (random error selection) | ✅ DONE |
|
| 48 |
+
| 65 | Implement full simulation (corrupt + word merge) | ✅ DONE |
|
| 49 |
+
|
| 50 |
+
### [pipeline.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/preprocessing/pipeline.py)
|
| 51 |
+
| Line | TODO | Status |
|
| 52 |
+
|------|------|--------|
|
| 53 |
+
| 38 | Implement initialisation (load spaCy + spell corrector) | ✅ DONE |
|
| 54 |
+
| 43 | Implement readability extraction (Flesch-Kincaid, Gunning Fog, SMOG, ARI) | ✅ DONE |
|
| 55 |
+
| 48 | Implement dependency tree extraction (SVO per sentence) | ✅ DONE |
|
| 56 |
+
| 53 | Implement full pipeline (7-step: spell→parse→segment→NER→deps→POS→readability) | ✅ DONE |
|
| 57 |
+
|
| 58 |
+
---
|
| 59 |
+
|
| 60 |
+
## src/style/ — 14 TODOs ✅
|
| 61 |
+
|
| 62 |
+
### [fingerprinter.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/style/fingerprinter.py)
|
| 63 |
+
| Line | TODO | Status |
|
| 64 |
+
|------|------|--------|
|
| 65 |
+
| 64 | Implement MLP layers (Linear→LayerNorm→GELU→Dropout→Linear→LayerNorm) | ✅ DONE |
|
| 66 |
+
| 68 | Implement forward pass (MLP projection) | ✅ DONE |
|
| 67 |
+
| 76 | Implement initialisation (spaCy + AWL + projection MLP) | ✅ DONE |
|
| 68 |
+
| 81 | Implement AWL loading from file | ✅ DONE |
|
| 69 |
+
| 86 | Implement passive voice detection (nsubjpass/auxpass dep labels) | ✅ DONE |
|
| 70 |
+
| 91 | Implement avg dependency tree depth | ✅ DONE |
|
| 71 |
+
| 96 | Implement lexical density (content words / total) | ✅ DONE |
|
| 72 |
+
| 101 | Implement raw feature extraction (~40 features) | ✅ DONE |
|
| 73 |
+
| 106 | Implement vector extraction (raw features → pad/truncate to 40 → MLP → 512-dim) | ✅ DONE |
|
| 74 |
+
| 120 | Implement vector blending with L2 normalisation | ✅ DONE |
|
| 75 |
+
|
| 76 |
+
### [formality_classifier.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/style/formality_classifier.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement initialisation | ✅ DONE |
| 19 | Implement formality scoring (0-1 scale) | ✅ DONE |

### [emotion_classifier.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/style/emotion_classifier.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement initialisation | ✅ DONE |
| 19 | Implement emotion classification (distribution over register categories) | ✅ DONE |

### [style_vector.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/style/style_vector.py)

| Line | TODO | Status |
|------|------|--------|
| 12 | Implement cosine similarity | ✅ DONE |
| 18 | Implement vector averaging | ✅ DONE |
| 24 | Implement save to disk | ✅ DONE |
| 30 | Implement load from disk | ✅ DONE |
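The four style_vector.py operations reduce to a few lines of NumPy. A sketch; the actual on-disk format is an assumption:

```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(a @ b / denom) if denom else 0.0

def average_vectors(vectors: list) -> np.ndarray:
    return np.stack(vectors).mean(axis=0)

# Round-trip through a .npy file.
v = average_vectors([np.ones(512), np.zeros(512)])
np.save("style_vector.npy", v)
print(cosine_similarity(v, np.load("style_vector.npy")))  # 1.0
```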
---

## src/model/ — 5 TODOs ✅

### [base_model.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/model/base_model.py)

| Line | TODO | Status |
|------|------|--------|
| 39 | Implement model loading (tokenizer + model + quantization + LoRA wrapping) | ✅ DONE |

### [lora_adapter.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/model/lora_adapter.py)

| Line | TODO | Status |
|------|------|--------|
| 20 | Implement LoRA config creation | ✅ DONE |
| 26 | Implement LoRA application to model | ✅ DONE |
| 32 | Implement weight merging for inference | ✅ DONE |
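With the peft library, the three lora_adapter.py steps map onto LoraConfig, get_peft_model, and merge_and_unload. A sketch: the rank, alpha, target modules, and the seq2seq base are placeholders (the repo's real values presumably live in configs/model_config.yaml):

```python
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSeq2SeqLM

def create_lora_config() -> LoraConfig:
    return LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,  # assumption: seq2seq base model
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["q", "v"],        # assumption: T5-style attention projections
    )

def apply_lora(model):
    peft_model = get_peft_model(model, create_lora_config())
    peft_model.print_trainable_parameters()
    return peft_model

def merge_for_inference(peft_model):
    # Fold the LoRA deltas back into the base weights for fast inference.
    return peft_model.merge_and_unload()

model = apply_lora(AutoModelForSeq2SeqLM.from_pretrained("t5-small"))
```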
### [style_conditioner.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/model/style_conditioner.py)

| Line | TODO | Status |
|------|------|--------|
| 27 | Implement projection layers (Linear → Tanh) | ✅ DONE |
| 37 | Implement forward pass (project + reshape) | ✅ DONE |
| 53 | Implement prefix prepending (torch.cat along seq dim) | ✅ DONE |

### [generation_utils.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/model/generation_utils.py)

| Line | TODO | Status |
|------|------|--------|
| 20 | Implement generation with beam search | ✅ DONE |
| 30 | Implement batch generation | ✅ DONE |
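The style_conditioner.py entries describe a prefix-tuning-style mechanism: project the 512-dim style vector into a handful of pseudo-token embeddings and concatenate them in front of the input embeddings. A sketch with assumed dimensions:

```python
import torch
import torch.nn as nn

class StyleConditionerSketch(nn.Module):
    """Project a style vector into k prefix embeddings (dimensions assumed)."""

    def __init__(self, style_dim: int = 512, hidden_dim: int = 768, prefix_len: int = 4):
        super().__init__()
        self.prefix_len = prefix_len
        self.proj = nn.Sequential(
            nn.Linear(style_dim, hidden_dim * prefix_len),
            nn.Tanh(),
        )

    def forward(self, style_vec: torch.Tensor) -> torch.Tensor:
        # (batch, style_dim) -> (batch, prefix_len, hidden_dim)
        batch = style_vec.size(0)
        return self.proj(style_vec).view(batch, self.prefix_len, -1)

def prepend_prefix(prefix: torch.Tensor, token_embeds: torch.Tensor) -> torch.Tensor:
    # Concatenate along the sequence dimension (dim=1).
    return torch.cat([prefix, token_embeds], dim=1)

cond = StyleConditionerSketch()
prefix = cond(torch.randn(2, 512))
embeds = prepend_prefix(prefix, torch.randn(2, 10, 768))
print(embeds.shape)  # torch.Size([2, 14, 768])
```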
---

## src/training/ — 22 TODOs ✅

### [dataset.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/training/dataset.py)

| Line | TODO | Status |
|------|------|--------|
| 54 | Implement initialisation and data loading | ✅ DONE |
| 59 | Implement JSONL loading | ✅ DONE |
| 64 | Implement synthetic data augmentation | ✅ DONE |
| 68 | Implement `__len__` | ✅ DONE |
| 73 | Implement `__getitem__` | ✅ DONE |

### [loss_functions.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/training/loss_functions.py)

| Line | TODO | Status |
|------|------|--------|
| 34 | Implement V1 initialisation | ✅ DONE |
| 43 | Implement style loss (1 - cosine_similarity) | ✅ DONE |
| 52 | Implement semantic loss | ✅ DONE |
| 65 | Implement combined loss V1 | ✅ DONE |
| 82 | Implement V2 initialisation with frozen classifier | ✅ DONE |
| 87 | Implement human pattern loss (1 - human_score) | ✅ DONE |
| 100 | Implement combined loss V2 | ✅ DONE |
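The V1/V2 split above suggests loss terms along these lines. The weights are illustrative assumptions, not the repo's configured values:

```python
import torch
import torch.nn.functional as F

def style_loss(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    # 1 - cosine similarity between predicted and target style fingerprints.
    return 1.0 - F.cosine_similarity(pred, target, dim=-1).mean()

def human_pattern_loss(human_score: torch.Tensor) -> torch.Tensor:
    # The frozen classifier emits P(human) in [0, 1]; push it towards 1.
    return (1.0 - human_score).mean()

def combined_loss_v2(ce, pred_style, target_style, human_score,
                     w_style=0.3, w_human=0.2):
    # V2 = V1 terms plus the anti-AI human-pattern term.
    return (ce
            + w_style * style_loss(pred_style, target_style)
            + w_human * human_pattern_loss(human_score))
```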
### [trainer.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/training/trainer.py)

| Line | TODO | Status |
|------|------|--------|
| 17 | Store loss function, fingerprinter, and tokenizer | ✅ DONE |
| 22 | Implement custom `compute_loss` | ✅ DONE |

### [callbacks.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/training/callbacks.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement evaluation-time style metric logging | ✅ DONE |
| 22 | Implement early stopping initialisation | ✅ DONE |
| 26 | Implement early stopping check | ✅ DONE |
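Plugging a composite loss into Hugging Face's Trainer means overriding compute_loss. A minimal sketch (the real trainer.py also wires in the fingerprinter and tokenizer; the exact compute_loss signature varies slightly across transformers versions, hence the **kwargs):

```python
from transformers import Trainer

class StyleTrainerSketch(Trainer):
    def __init__(self, *args, loss_fn=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = loss_fn  # e.g. combined_loss_v2 wired to a fingerprinter

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs = model(**inputs)
        loss = outputs.loss if self.loss_fn is None else self.loss_fn(outputs, inputs)
        return (loss, outputs) if return_outputs else loss
```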
### [human_pattern_extractor.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/training/human_pattern_extractor.py)

| Line | TODO | Status |
|------|------|--------|
| 68 | Implement initialisation (spaCy + GPT-2) | ✅ DONE |
| 73 | Implement GPT-2 perplexity calculation | ✅ DONE |
| 78 | Implement burstiness | ✅ DONE |
| 83 | Implement sentence starter diversity | ✅ DONE |
| 88 | Implement n-gram novelty | ✅ DONE |
| 93 | Implement AI marker density | ✅ DONE |
| 98 | Implement discourse density | ✅ DONE |
| 103 | Implement punctuation patterns | ✅ DONE |
| 108 | Implement full 17-dim feature extraction | ✅ DONE |
| 125 | Implement KaggleHumanPatternDataset loading | ✅ DONE |
| 129 | Implement `__len__` | ✅ DONE |
| 133 | Implement `__getitem__` | ✅ DONE |
| 148 | Implement HumanPatternClassifier MLP layers | ✅ DONE |
| 153 | Implement forward pass | ✅ DONE |
| 158 | Implement single-text scoring | ✅ DONE |
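Two of the 17 features are simple enough to show inline. Sketches of plausible definitions; the extractor's exact formulas may differ:

```python
import statistics

def burstiness(sentence_lengths: list) -> float:
    # Variation in sentence length (std / mean). Human text tends to be
    # burstier than generated text.
    if len(sentence_lengths) < 2:
        return 0.0
    mean = statistics.mean(sentence_lengths)
    return statistics.stdev(sentence_lengths) / mean if mean else 0.0

def ngram_novelty(tokens: list, n: int = 3) -> float:
    # Share of distinct n-grams among all n-grams; repetitive text scores low.
    grams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    return len(set(grams)) / len(grams) if grams else 0.0

print(burstiness([5, 22, 9, 31]))
print(ngram_novelty("the cat sat on the cat sat".split()))  # 0.8
```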
---

## src/vocabulary/ — 10 TODOs ✅

### [awl_loader.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/vocabulary/awl_loader.py)

| Line | TODO | Status |
|------|------|--------|
| 21 | Implement initialisation | ✅ DONE |
| 26 | Implement word list file loading | ✅ DONE |
| 31 | Implement synonym JSON loading | ✅ DONE |
| 36 | Implement `is_academic()` | ✅ DONE |
| 41 | Implement `get_academic_synonyms()` | ✅ DONE |
| 47 | Implement `all_words` property | ✅ DONE |

### [lexical_substitution.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/vocabulary/lexical_substitution.py)

| Line | TODO | Status |
|------|------|--------|
| 41 | Implement initialisation | ✅ DONE |
| 46 | Implement contextual semantic similarity | ✅ DONE |
| 51 | Implement AWL substitution generation | ✅ DONE |
| 56 | Implement vocabulary elevation | ✅ DONE |
| 106 | Implement register filtering | ✅ DONE |
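The awl_loader.py surface is a thin wrapper over two data files. A sketch with hypothetical paths and JSON shape (both are assumptions):

```python
import json

class AWLLoaderSketch:
    """Academic Word List helper; file paths and format are illustrative."""

    def __init__(self, words_path: str = "data/awl_words.txt",
                 synonyms_path: str = "data/awl_synonyms.json"):
        with open(words_path, encoding="utf-8") as f:
            self._words = {line.strip().lower() for line in f if line.strip()}
        with open(synonyms_path, encoding="utf-8") as f:
            self._synonyms = json.load(f)  # {"common_word": ["academic", ...]}

    def is_academic(self, word: str) -> bool:
        return word.lower() in self._words

    def get_academic_synonyms(self, word: str) -> list:
        return self._synonyms.get(word.lower(), [])

    @property
    def all_words(self) -> set:
        return set(self._words)
```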
### [register_filter.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/vocabulary/register_filter.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement initialisation | ✅ DONE |
| 19 | Implement nominalisation | ✅ DONE |
| 24 | Implement hedging | ✅ DONE |
| 29 | Implement formality check | ✅ DONE |

---

## src/evaluation/ — 7 TODOs ✅

### [gleu_scorer.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/evaluation/gleu_scorer.py)

| Line | TODO | Status |
|------|------|--------|
| 20 | Implement corpus-level GLEU scoring | ✅ DONE |
| 29 | Implement BERTScore computation | ✅ DONE |
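Corpus-level GLEU is available off the shelf. A sketch using NLTK's implementation, on the assumption that gleu_scorer.py uses this or an equivalent backend:

```python
from nltk.translate.gleu_score import corpus_gleu

# One list of reference token lists per hypothesis.
references = [[["the", "cat", "sat", "on", "the", "mat"]]]
hypotheses = [["the", "cat", "sat", "on", "mat"]]

print(corpus_gleu(references, hypotheses))
```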
### [errant_evaluator.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/evaluation/errant_evaluator.py)

| Line | TODO | Status |
|------|------|--------|
| 15 | Implement initialisation (ERRANT annotator) | ✅ DONE |
| 23 | Implement ERRANT evaluation | ✅ DONE |

### [style_metrics.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/evaluation/style_metrics.py)

| Line | TODO | Status |
|------|------|--------|
| 19 | Implement style similarity | ✅ DONE |
| 24 | Implement AWL coverage | ✅ DONE |
| 33 | Implement batch evaluation | ✅ DONE |

### [authorship_verifier.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/evaluation/authorship_verifier.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement initialisation (load model) | ✅ DONE |
| 19 | Implement authorship verification | ✅ DONE |

---

## src/inference/ — 3 TODOs ✅

### [corrector.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/inference/corrector.py)

| Line | TODO | Status |
|------|------|--------|
| 39 | Implement initialisation | ✅ DONE |
| 52 | Implement full correction pipeline | ✅ DONE |

### [postprocessor.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/inference/postprocessor.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement initialisation | ✅ DONE |
| 19 | Implement text cleanup | ✅ DONE |
| 27 | Implement entity restoration | ✅ DONE |
| 32 | Implement final formatting | ✅ DONE |

---

## src/api/ — 2 TODOs ✅

### [main.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/api/main.py)

| Line | TODO | Status |
|------|------|--------|
| 22 | Load config and initialise corrector on startup | ✅ DONE |
| 31 | Implement `/correct` endpoint | ✅ DONE |

### [middleware.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/src/api/middleware.py)

| Line | TODO | Status |
|------|------|--------|
| 14 | Implement request logging (timing, path, status) | ✅ DONE |
| 22 | Implement rate limiter state | ✅ DONE |
| 26 | Implement rate limiting logic | ✅ DONE |
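The middleware entries amount to a request logger plus a rate limiter. A sketch of a fixed-window limiter as FastAPI middleware; the window size and limit are made up, and middleware.py may structure this differently:

```python
import time
from collections import defaultdict

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

app = FastAPI()
WINDOW_S, LIMIT = 60, 30          # illustrative: 30 requests per minute
_hits = defaultdict(list)          # rate limiter state: client -> timestamps

@app.middleware("http")
async def rate_limit(request: Request, call_next):
    now = time.monotonic()
    client = request.client.host if request.client else "unknown"
    # Keep only hits inside the current window, then check the budget.
    _hits[client] = [t for t in _hits[client] if now - t < WINDOW_S]
    if len(_hits[client]) >= LIMIT:
        return JSONResponse({"detail": "rate limit exceeded"}, status_code=429)
    _hits[client].append(now)
    return await call_next(request)

# Run with: uvicorn this_module:app
```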
---

## scripts/ — 5 TODOs ✅

### [train.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/scripts/train.py)

| Line | TODO | Status |
|------|------|--------|
| 24 | Implement training pipeline (10 steps) | ✅ DONE |

### [evaluate.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/scripts/evaluate.py)

| Line | TODO | Status |
|------|------|--------|
| 19 | Implement evaluation pipeline | ✅ DONE |

### [run_inference.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/scripts/run_inference.py)

| Line | TODO | Status |
|------|------|--------|
| 21 | Implement inference pipeline | ✅ DONE |

### [pretrain_human_pattern_classifier.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/scripts/pretrain_human_pattern_classifier.py)

| Line | TODO | Status |
|------|------|--------|
| 23 | Implement classifier pre-training | ✅ DONE |

---

## tests/ — 18 TODOs ✅

### [test_preprocessing.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/tests/test_preprocessing.py) — 7 tests ✅
### [test_style.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/tests/test_style.py) — 4 tests ✅
### [test_model.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/tests/test_model.py) — 2 tests + 3 new ✅
### [test_vocabulary.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/tests/test_vocabulary.py) — 4 tests ✅
### [test_evaluation.py](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/tests/test_evaluation.py) — 4 tests ✅

---

## Shell Scripts ✅

| Script | Purpose |
|--------|---------|
| [train.sh](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/train.sh) | Multi-stage training with Skip/Redo/Continue checkpoint system |
| [start.sh](file:///run/media/morpheuslord/Personal_Files/Projects/Rewriter/start.sh) | Inference launcher (CLI REPL or API server) |

---

## Summary by Package

| Package | TODOs | Status |
|---------|-------|--------|
| `src/preprocessing/` | 16 | ✅ ALL DONE |
| `src/style/` | 14 | ✅ ALL DONE |
| `src/model/` | 5 | ✅ ALL DONE |
| `src/training/` | 22 | ✅ ALL DONE |
| `src/vocabulary/` | 10 | ✅ ALL DONE |
| `src/evaluation/` | 7 | ✅ ALL DONE |
| `src/inference/` | 3 | ✅ ALL DONE |
| `src/api/` | 2 | ✅ ALL DONE |
| `scripts/` | 5 | ✅ ALL DONE |
| `tests/` | 18 | ✅ ALL DONE |
| **Total** | **97** | ✅ **ALL DONE** |
train.sh
ADDED
@@ -0,0 +1,215 @@
#!/usr/bin/env bash
# ═══════════════════════════════════════════════════════════════════════════
# train.sh — Multi-stage training orchestrator with checkpoint system
# ═══════════════════════════════════════════════════════════════════════════
#
# Usage: bash train.sh [--config=CONFIG] [--auto]
#
# Each stage prompts: [S]kip, [R]edo, [C]ontinue
# Use --auto to skip all prompts and auto-detect what needs running
#
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
export PYTHONPATH="${SCRIPT_DIR}:${PYTHONPATH:-}"
CONFIG="configs/training_config.yaml"
AUTO_MODE=false

# Parse args. A bare positional argument still selects the config file;
# grabbing "$1" directly (as before) would mistake --auto for a config path.
for arg in "$@"; do
    case $arg in
        --auto) AUTO_MODE=true ;;
        --config=*) CONFIG="${arg#*=}" ;;
        --*) ;;
        *) CONFIG="$arg" ;;
    esac
done

# ── Colors ──────────────────────────────────────────────────────────────────
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

info() { echo -e "${CYAN}[INFO]${NC} $1"; }
ok()   { echo -e "${GREEN}[ OK]${NC} $1"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
err()  { echo -e "${RED}[FAIL]${NC} $1"; }

# ── Stage prompt function ──────────────────────────────────────────────────
# Asks the user to [S]kip, [R]edo, or [C]ontinue each stage
prompt_stage() {
    local stage_name="$1"
    local check_file="$2"  # File to check if stage already completed

    echo ""
    echo -e "${BOLD}═══ Stage: ${stage_name} ═══${NC}"

    if [ "$AUTO_MODE" = true ]; then
        if [ -n "$check_file" ] && [ -e "$check_file" ]; then
            info "Auto-mode: $check_file exists, skipping"
            return 1  # Skip
        fi
        return 0  # Continue
    fi

    if [ -n "$check_file" ] && [ -e "$check_file" ]; then
        warn "Previous output found: $check_file"
        echo -e "  ${YELLOW}[S]${NC}kip | ${CYAN}[R]${NC}edo | ${GREEN}[C]${NC}ontinue"
        read -rp "  Choice [S/R/C]: " choice
        case "${choice,,}" in
            r|redo)     info "Redoing ${stage_name}...";    return 0 ;;
            c|continue) info "Continuing ${stage_name}..."; return 0 ;;
            *)          info "Skipping ${stage_name}";      return 1 ;;
        esac
    else
        info "No previous output found. Running ${stage_name}..."
        return 0
    fi
}

# ── Detect environment ─────────────────────────────────────────────────────
detect_env() {
    echo -e "${BOLD}═══ Environment Detection ═══${NC}"

    # Python
    if command -v python3 &>/dev/null; then
        PYTHON=python3
    elif command -v python &>/dev/null; then
        PYTHON=python
    else
        err "Python not found!"
        exit 1
    fi
    ok "Python: $($PYTHON --version 2>&1)"

    # GPU
    if $PYTHON -c "import torch; print(torch.cuda.is_available())" 2>/dev/null | grep -q "True"; then
        GPU_AVAILABLE=true
        GPU_NAME=$($PYTHON -c "import torch; print(torch.cuda.get_device_name(0))" 2>/dev/null || echo "Unknown")
        ok "GPU: $GPU_NAME"

        # Check compute capability for bf16
        COMPUTE_CAP=$($PYTHON -c "import torch; print(torch.cuda.get_device_capability()[0])" 2>/dev/null || echo "0")
        if [ "$COMPUTE_CAP" -ge 8 ]; then
            PRECISION="bf16"
        else
            PRECISION="fp16"
        fi
        ok "Precision: $PRECISION"
    else
        GPU_AVAILABLE=false
        PRECISION="fp32"
        warn "No GPU detected — training will use CPU (optimised settings)"
    fi

    # W&B
    if [ -n "${WANDB_API_KEY:-}" ]; then
        ok "W&B: API key found"
    else
        warn "W&B: No API key (WANDB_API_KEY). Logging to TensorBoard only."
        export WANDB_DISABLED=true
    fi
}

# ═══════════════════════════════════════════════════════════════════════════
# STAGE 1: Install dependencies & download models
# ═══════════════════════════════════════════════════════════════════════════
stage_1_setup() {
    if prompt_stage "Setup & Dependencies" ".train_stage1_done"; then
        info "Installing Python dependencies..."
        $PYTHON -m pip install -r requirements.txt --quiet 2>&1 | tail -5

        info "Downloading spaCy models..."
        $PYTHON -m spacy download en_core_web_sm --quiet 2>/dev/null || true

        info "Downloading NLTK data..."
        $PYTHON -c "import nltk; nltk.download('punkt', quiet=True); nltk.download('punkt_tab', quiet=True)" 2>/dev/null || true

        touch .train_stage1_done
        ok "Setup complete"
    fi
}

# ═══════════════════════════════════════════════════════════════════════════
# STAGE 2: Data preprocessing
# ═══════════════════════════════════════════════════════════════════════════
stage_2_preprocess() {
    if prompt_stage "Data Preprocessing" "data/processed/train.jsonl"; then
        info "Preprocessing datasets into unified JSONL..."
        $PYTHON scripts/preprocess_data.py
        ok "Data preprocessing complete"
    fi
}

# ═══════════════════════════════════════════════════════════════════════════
# STAGE 3: Pre-train human pattern classifier
# ═══════════════════════════════════════════════════════════════════════════
stage_3_pretrain_classifier() {
    if prompt_stage "Pre-train Human Pattern Classifier" "checkpoints/human_pattern_classifier.pt"; then
        info "Pre-training human pattern classifier on Kaggle datasets..."
        info "This may take a while on CPU (extracting features for ~100k texts)..."
        $PYTHON scripts/pretrain_human_pattern_classifier.py
        ok "Human pattern classifier pre-trained"
    fi
}

# ═══════════════════════════════════════════════════════════════════════════
# STAGE 4: Main model training
# ═══════════════════════════════════════════════════════════════════════════
stage_4_train() {
    if prompt_stage "Main Model Training" "checkpoints/best_model/config.json"; then
        info "Starting main model training..."
        info "Config: $CONFIG"

        # Add V2 loss flag if classifier exists
        V2_FLAG=""
        if [ -f "checkpoints/human_pattern_classifier.pt" ]; then
            info "Human pattern classifier found — using V2 loss (with anti-AI term)"
            V2_FLAG="--use-v2-loss"
        fi

        $PYTHON scripts/train.py --config "$CONFIG" $V2_FLAG
        ok "Main training complete"
    fi
}

# ═══════════════════════════════════════════════════════════════════════════
# STAGE 5: Evaluation
# ═══════════════════════════════════════════════════════════════════════════
stage_5_evaluate() {
    if prompt_stage "Evaluation" "logs/eval_results_test.json"; then
        info "Running evaluation on test set..."
        mkdir -p logs
        $PYTHON scripts/evaluate.py --config "$CONFIG" --split test
        ok "Evaluation complete"
    fi
}

# ═══════════════════════════════════════════════════════════════════════════
# Main
# ═══════════════════════════════════════════════════════════════════════════
main() {
    echo ""
    echo -e "${BOLD}╔══════════════════════════════════════════════════════════╗${NC}"
    echo -e "${BOLD}║   Dyslexia Academic Writing Corrector — Training Suite   ║${NC}"
    echo -e "${BOLD}╚══════════════════════════════════════════════════════════╝${NC}"
    echo ""

    detect_env

    stage_1_setup
    stage_2_preprocess
    stage_3_pretrain_classifier
    stage_4_train
    stage_5_evaluate

    echo ""
    echo -e "${GREEN}${BOLD}═══ All stages complete! ═══${NC}"
    echo -e "  Model saved to:  ${CYAN}checkpoints/best_model/${NC}"
    echo -e "  Eval results:    ${CYAN}logs/eval_results_test.json${NC}"
    echo -e "  Start inference: ${CYAN}bash start.sh${NC}"
    echo ""
}

main
wandb/debug-internal.log
ADDED
@@ -0,0 +1,578 @@
{"time":"2026-05-03T12:41:31.910510511+05:30","level":"INFO","msg":"wandb-core"}
{"time":"2026-05-03T12:41:31.911235013+05:30","level":"INFO","msg":"stream: starting","core version":"0.26.1"}
{"time":"2026-05-03T12:41:32.640591639+05:30","level":"INFO","msg":"stream: created new stream","id":"7q4dwe22"}
{"time":"2026-05-03T12:41:32.640743705+05:30","level":"INFO","msg":"handler: started"}
{"time":"2026-05-03T12:41:32.64115088+05:30","level":"INFO","msg":"stream: started"}
{"time":"2026-05-03T12:41:32.641160468+05:30","level":"INFO","msg":"writer: started","stream_id":"7q4dwe22"}
{"time":"2026-05-03T12:41:32.641172701+05:30","level":"INFO","msg":"sender: started"}
{"time":"2026-05-03T12:41:33.623792544+05:30","level":"INFO","msg":"filestream: sending request","total_files":1,"console_offset":0,"console_lines":1}
{"time":"2026-05-03T12:41:34.381206382+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:41:48.6250478+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":0,"events_lines":2,"console_offset":0,"console_lines":32,"uploaded_len":2}
{"time":"2026-05-03T12:41:52.610177283+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:42:03.62427825+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":2,"events_lines":2,"console_offset":31,"console_lines":2}
{"time":"2026-05-03T12:42:04.079934308+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:42:18.624675392+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":4,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:42:19.131375894+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:42:33.624454986+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":6,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:42:34.185439368+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:42:48.624368649+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":8,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:42:52.050509317+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:43:03.624817069+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":10,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:43:04.189007008+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:43:18.624408595+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":12,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:43:19.03607342+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:43:33.624862786+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":0,"history_lines":1,"events_offset":14,"events_lines":2,"console_offset":32,"console_lines":2}
{"time":"2026-05-03T12:43:34.088654055+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:43:48.623936217+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":16,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:43:52.622306426+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:44:03.624968066+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":18,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:44:04.159531988+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:44:18.62395356+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":20,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:44:19.042602519+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:44:33.624505635+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":22,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:44:34.179444461+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:44:48.624294713+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":24,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:44:52.488535013+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:45:03.624694431+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":26,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:45:04.171236603+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:45:18.624353905+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":28,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:45:19.049334269+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:45:33.625499719+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":1,"history_lines":1,"events_offset":30,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:45:34.205775314+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:45:48.624246484+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":32,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:45:52.466463116+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:46:03.624377356+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":34,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:46:04.106028784+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:46:18.623990934+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":36,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:46:19.363307766+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:46:33.624399178+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":38,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:46:34.211508133+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:46:48.624496958+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":40,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:46:56.325382987+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:47:03.624347271+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":42,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:47:04.112261534+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:47:18.624559566+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":44,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:47:19.062715354+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:47:33.62485639+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":46,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:47:34.126644783+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:47:48.624876584+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":2,"history_lines":1,"events_offset":48,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:47:52.547877604+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:48:03.624169297+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":50,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:48:04.119370364+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:48:18.624748914+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":52,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:48:19.10634659+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:48:33.624565795+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":54,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:48:34.122699515+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:48:48.624545462+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":56,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:48:52.656977803+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:49:03.624596012+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":58,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:49:04.330825648+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:49:18.624564598+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":60,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:49:19.078491359+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:49:33.624629606+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":62,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:49:34.233481381+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:49:48.623896921+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":64,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:49:52.499893573+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:50:03.625175815+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":3,"history_lines":1,"events_offset":66,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:50:04.236709822+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:50:18.624165748+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":68,"events_lines":2,"console_offset":34,"console_lines":4}
{"time":"2026-05-03T12:50:19.084054207+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:50:33.624082116+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":70,"events_lines":2,"console_offset":37,"console_lines":1}
{"time":"2026-05-03T12:50:34.239458399+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:50:48.62427245+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":4,"history_lines":1,"events_offset":72,"events_lines":2,"console_offset":32,"console_lines":1}
{"time":"2026-05-03T12:50:52.159206398+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:51:03.624243519+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":74,"events_lines":2,"console_offset":37,"console_lines":8}
{"time":"2026-05-03T12:51:04.139955016+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:51:18.623551729+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":76,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:51:19.090345066+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:51:33.624706726+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":78,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:51:34.143803257+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:51:48.624581596+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":80,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:51:52.775577109+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:52:03.625946523+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":5,"history_lines":1,"events_offset":82,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:52:04.145798756+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:52:18.624567709+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":84,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:52:19.097700993+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:52:33.624587759+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":86,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:52:34.048968605+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:52:48.625017571+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":88,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:52:52.480420415+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:53:03.62479273+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":90,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:53:04.051441036+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:53:18.625071648+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":6,"history_lines":1,"events_offset":92,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:53:19.320213512+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:53:33.624825898+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":94,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:53:34.054856352+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:53:48.624712266+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":96,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:53:52.088255736+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:54:03.624162447+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":98,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:54:04.058358464+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:54:18.624352267+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":100,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:54:19.21327141+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:54:33.62520724+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":102,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:54:34.063683346+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:54:48.623773001+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":7,"history_lines":1,"events_offset":104,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:54:53.824628369+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:55:03.624869319+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":106,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:55:04.065156475+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:55:18.624737083+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":108,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:55:19.117949184+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:55:33.623963358+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":110,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:55:34.069701428+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:55:48.624698335+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":112,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:55:52.409930019+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:56:03.624554903+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":114,"events_lines":2,"console_offset":43,"console_lines":1}
{"time":"2026-05-03T12:56:04.481524049+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:56:18.624936254+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":8,"history_lines":1,"events_offset":116,"events_lines":2,"console_offset":43,"console_lines":6}
{"time":"2026-05-03T12:56:19.227360947+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:56:33.625250748+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":9,"history_lines":1,"events_offset":118,"events_lines":2,"console_offset":43,"console_lines":2}
{"time":"2026-05-03T12:56:34.280924135+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:56:48.623974918+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":120,"events_lines":2,"console_offset":49,"console_lines":7}
{"time":"2026-05-03T12:56:53.838367555+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:57:03.625033774+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":122,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:57:04.183208156+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:57:18.624445078+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":124,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:57:19.131798717+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:57:33.624908594+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":126,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:57:34.055298865+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:57:48.623865111+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":128,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:57:53.848616674+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:58:03.625210172+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":130,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:58:04.188844515+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:58:18.625197519+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":10,"history_lines":1,"events_offset":132,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:58:19.24219959+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:58:33.624258984+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":134,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:58:34.396259329+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:58:48.624940129+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":136,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:58:53.848822696+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:59:03.624295658+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":138,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:59:04.400379221+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:59:18.624665157+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":140,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:59:19.145926143+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:59:33.624817526+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":142,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:59:34.206762226+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T12:59:48.624390304+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":144,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T12:59:53.860559262+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:00:03.623875503+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":146,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:00:04.202208533+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:00:18.624929907+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":148,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:00:19.050716927+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:00:33.624476231+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":11,"history_lines":1,"events_offset":150,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:00:34.307214726+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:00:48.624634057+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":152,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:00:52.469169514+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:01:03.624924814+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":154,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:01:04.107064659+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:01:18.624653321+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":156,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:01:19.066067239+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:01:33.624502786+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":158,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:01:34.109698097+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:01:48.624767253+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":160,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:01:53.785694074+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:02:03.624903217+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":162,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:02:05.035353094+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:02:18.624580087+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":164,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:02:19.078738091+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:02:33.624211562+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":166,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:02:34.116294968+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:02:48.62498625+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":12,"history_lines":1,"events_offset":168,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:02:52.549222964+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:03:03.623912797+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":170,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:03:04.118997416+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:03:18.62405338+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":172,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:03:19.20998837+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:03:33.624613805+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":174,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:03:34.123314056+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:03:48.624508585+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":176,"events_lines":2,"console_offset":54,"console_lines":1}
{"time":"2026-05-03T13:03:53.78511401+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:04:03.625266311+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":13,"history_lines":1,"events_offset":178,"events_lines":2,"console_offset":54,"console_lines":6}
{"time":"2026-05-03T13:04:04.043331726+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:04:18.625263545+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":14,"history_lines":1,"events_offset":180,"events_lines":2,"console_offset":54,"console_lines":2}
{"time":"2026-05-03T13:04:19.076967316+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:04:33.624870854+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":182,"events_lines":2,"console_offset":60,"console_lines":7}
{"time":"2026-05-03T13:04:34.232591774+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:04:48.624554166+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":184,"events_lines":2,"console_offset":65,"console_lines":1}
{"time":"2026-05-03T13:04:53.893576903+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:05:03.624387682+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":186,"events_lines":2,"console_offset":65,"console_lines":1}
{"time":"2026-05-03T13:05:04.132708966+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:05:18.624056957+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":188,"events_lines":2,"console_offset":65,"console_lines":1}
{"time":"2026-05-03T13:05:19.084294163+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:05:33.623642485+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":15,"history_lines":1,"events_offset":190,"events_lines":2,"console_offset":65,"console_lines":1}
{"time":"2026-05-03T13:05:34.135980166+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
{"time":"2026-05-03T13:05:48.624842204+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":192,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 203 |
+
{"time":"2026-05-03T13:05:53.790732523+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 204 |
+
{"time":"2026-05-03T13:06:03.624205493+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":194,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 205 |
+
{"time":"2026-05-03T13:06:04.016288572+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 206 |
+
{"time":"2026-05-03T13:06:18.624981694+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":196,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 207 |
+
{"time":"2026-05-03T13:06:19.397699848+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 208 |
+
{"time":"2026-05-03T13:06:33.623935241+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":198,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 209 |
+
{"time":"2026-05-03T13:06:34.044819946+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 210 |
+
{"time":"2026-05-03T13:06:48.624728354+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":200,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 211 |
+
{"time":"2026-05-03T13:06:49.883946156+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 212 |
+
{"time":"2026-05-03T13:07:03.625032345+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":202,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 213 |
+
{"time":"2026-05-03T13:07:04.251028411+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 214 |
+
{"time":"2026-05-03T13:07:18.625057902+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":16,"history_lines":1,"events_offset":204,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 215 |
+
{"time":"2026-05-03T13:07:19.426577362+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 216 |
+
{"time":"2026-05-03T13:07:33.623808377+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":206,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 217 |
+
{"time":"2026-05-03T13:07:34.150919588+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 218 |
+
{"time":"2026-05-03T13:07:48.624900844+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":208,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 219 |
+
{"time":"2026-05-03T13:07:53.732010245+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 220 |
+
{"time":"2026-05-03T13:08:03.624036094+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":210,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 221 |
+
{"time":"2026-05-03T13:08:04.153782378+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 222 |
+
{"time":"2026-05-03T13:08:18.623716099+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":212,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 223 |
+
{"time":"2026-05-03T13:08:19.213228745+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 224 |
+
{"time":"2026-05-03T13:08:33.625509812+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":17,"history_lines":1,"events_offset":214,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 225 |
+
{"time":"2026-05-03T13:08:34.273258552+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 226 |
+
{"time":"2026-05-03T13:08:48.623779221+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":216,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 227 |
+
{"time":"2026-05-03T13:08:53.74462565+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 228 |
+
{"time":"2026-05-03T13:09:03.624590421+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":218,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 229 |
+
{"time":"2026-05-03T13:09:04.265483888+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 230 |
+
{"time":"2026-05-03T13:09:18.624592677+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":220,"events_lines":2,"console_offset":65,"console_lines":1}
|
| 231 |
+
{"time":"2026-05-03T13:09:19.111922405+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 232 |
+
{"time":"2026-05-03T13:09:33.624187264+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":18,"history_lines":1,"events_offset":222,"events_lines":2,"console_offset":65,"console_lines":6}
|
| 233 |
+
{"time":"2026-05-03T13:09:34.06163637+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 234 |
+
{"time":"2026-05-03T13:09:48.62435919+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":19,"history_lines":1,"events_offset":224,"events_lines":2,"console_offset":65,"console_lines":2}
|
| 235 |
+
{"time":"2026-05-03T13:09:53.928179314+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 236 |
+
{"time":"2026-05-03T13:10:03.624241639+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":226,"events_lines":2,"console_offset":71,"console_lines":7}
|
| 237 |
+
{"time":"2026-05-03T13:10:04.269883116+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 238 |
+
{"time":"2026-05-03T13:10:18.623698326+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":228,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 239 |
+
{"time":"2026-05-03T13:10:19.220580256+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 240 |
+
{"time":"2026-05-03T13:10:33.625208324+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":230,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 241 |
+
{"time":"2026-05-03T13:10:34.377376961+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 242 |
+
{"time":"2026-05-03T13:10:48.624352793+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":232,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 243 |
+
{"time":"2026-05-03T13:10:53.832676215+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 244 |
+
{"time":"2026-05-03T13:11:03.624982324+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":20,"history_lines":1,"events_offset":234,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 245 |
+
{"time":"2026-05-03T13:11:04.079992269+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 246 |
+
{"time":"2026-05-03T13:11:18.62489984+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":236,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 247 |
+
{"time":"2026-05-03T13:11:19.027282435+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 248 |
+
{"time":"2026-05-03T13:11:33.624563879+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":238,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 249 |
+
{"time":"2026-05-03T13:11:34.041564944+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 250 |
+
{"time":"2026-05-03T13:11:48.624587725+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":240,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 251 |
+
{"time":"2026-05-03T13:11:53.917050708+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 252 |
+
{"time":"2026-05-03T13:12:03.62517304+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":242,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 253 |
+
{"time":"2026-05-03T13:12:04.092490879+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 254 |
+
{"time":"2026-05-03T13:12:18.624426504+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":244,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 255 |
+
{"time":"2026-05-03T13:12:19.136291819+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 256 |
+
{"time":"2026-05-03T13:12:33.62422219+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":246,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 257 |
+
{"time":"2026-05-03T13:12:34.159602779+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 258 |
+
{"time":"2026-05-03T13:12:48.625348431+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":21,"history_lines":1,"events_offset":248,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 259 |
+
{"time":"2026-05-03T13:12:52.515794539+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 260 |
+
{"time":"2026-05-03T13:13:03.624617124+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":250,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 261 |
+
{"time":"2026-05-03T13:13:04.189876876+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 262 |
+
{"time":"2026-05-03T13:13:18.624355863+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":252,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 263 |
+
{"time":"2026-05-03T13:13:19.242568869+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 264 |
+
{"time":"2026-05-03T13:13:33.62437469+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":254,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 265 |
+
{"time":"2026-05-03T13:13:34.203229293+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 266 |
+
{"time":"2026-05-03T13:13:48.624058475+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":22,"history_lines":1,"events_offset":256,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 267 |
+
{"time":"2026-05-03T13:13:52.522178792+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 268 |
+
{"time":"2026-05-03T13:14:03.624159107+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":258,"events_lines":2,"console_offset":76,"console_lines":1}
|
| 269 |
+
{"time":"2026-05-03T13:14:04.197766657+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 270 |
+
{"time":"2026-05-03T13:14:18.624297209+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":23,"history_lines":1,"events_offset":260,"events_lines":2,"console_offset":76,"console_lines":6}
|
| 271 |
+
{"time":"2026-05-03T13:14:19.249825938+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 272 |
+
{"time":"2026-05-03T13:14:33.62367618+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":24,"history_lines":1,"events_offset":262,"events_lines":2,"console_offset":76,"console_lines":2}
|
| 273 |
+
{"time":"2026-05-03T13:14:34.200330044+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 274 |
+
{"time":"2026-05-03T13:14:48.623745102+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":264,"events_lines":2,"console_offset":82,"console_lines":7}
|
| 275 |
+
{"time":"2026-05-03T13:14:52.630682314+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 276 |
+
{"time":"2026-05-03T13:15:03.625600932+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":25,"history_lines":1,"events_offset":266,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 277 |
+
{"time":"2026-05-03T13:15:04.10091133+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 278 |
+
{"time":"2026-05-03T13:15:18.624529784+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":268,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 279 |
+
{"time":"2026-05-03T13:15:19.358910816+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 280 |
+
{"time":"2026-05-03T13:15:33.624544924+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":270,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 281 |
+
{"time":"2026-05-03T13:15:34.104052485+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 282 |
+
{"time":"2026-05-03T13:15:48.624128201+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":272,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 283 |
+
{"time":"2026-05-03T13:15:52.637656189+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 284 |
+
{"time":"2026-05-03T13:16:03.624463395+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":274,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 285 |
+
{"time":"2026-05-03T13:16:04.215823444+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 286 |
+
{"time":"2026-05-03T13:16:18.624989849+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":26,"history_lines":1,"events_offset":276,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 287 |
+
{"time":"2026-05-03T13:16:19.263366851+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 288 |
+
{"time":"2026-05-03T13:16:33.624681349+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":278,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 289 |
+
{"time":"2026-05-03T13:16:34.109971882+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 290 |
+
{"time":"2026-05-03T13:16:48.624154227+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":280,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 291 |
+
{"time":"2026-05-03T13:16:53.911552782+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 292 |
+
{"time":"2026-05-03T13:17:03.624780158+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":282,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 293 |
+
{"time":"2026-05-03T13:17:04.114449708+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 294 |
+
{"time":"2026-05-03T13:17:18.623982256+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":284,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 295 |
+
{"time":"2026-05-03T13:17:19.064687832+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 296 |
+
{"time":"2026-05-03T13:17:33.624478322+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":27,"history_lines":1,"events_offset":286,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 297 |
+
{"time":"2026-05-03T13:17:34.117543273+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 298 |
+
{"time":"2026-05-03T13:17:48.624168179+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":288,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 299 |
+
{"time":"2026-05-03T13:17:52.550289777+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 300 |
+
{"time":"2026-05-03T13:18:03.624722476+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":290,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 301 |
+
{"time":"2026-05-03T13:18:04.020487283+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 302 |
+
{"time":"2026-05-03T13:18:18.624886096+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":292,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 303 |
+
{"time":"2026-05-03T13:18:19.07233748+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 304 |
+
{"time":"2026-05-03T13:18:33.625368527+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":294,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 305 |
+
{"time":"2026-05-03T13:18:34.124776857+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 306 |
+
{"time":"2026-05-03T13:18:48.624611181+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":296,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 307 |
+
{"time":"2026-05-03T13:18:52.419268795+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 308 |
+
{"time":"2026-05-03T13:19:03.624283998+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":298,"events_lines":2,"console_offset":87,"console_lines":1}
|
| 309 |
+
{"time":"2026-05-03T13:19:04.230735811+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 310 |
+
{"time":"2026-05-03T13:19:18.624409847+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":28,"history_lines":1,"events_offset":300,"events_lines":2,"console_offset":87,"console_lines":6}
|
| 311 |
+
{"time":"2026-05-03T13:19:19.180387589+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 312 |
+
{"time":"2026-05-03T13:19:33.624822037+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":29,"history_lines":1,"events_offset":302,"events_lines":2,"console_offset":87,"console_lines":2}
|
| 313 |
+
{"time":"2026-05-03T13:19:34.132064256+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 314 |
+
{"time":"2026-05-03T13:19:48.624533775+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":304,"events_lines":2,"console_offset":93,"console_lines":7}
|
| 315 |
+
{"time":"2026-05-03T13:19:52.564304075+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 316 |
+
{"time":"2026-05-03T13:20:03.623942702+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":306,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 317 |
+
{"time":"2026-05-03T13:20:04.136088386+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 318 |
+
{"time":"2026-05-03T13:20:18.62579199+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":308,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 319 |
+
{"time":"2026-05-03T13:20:19.085933299+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 320 |
+
{"time":"2026-05-03T13:20:33.624760978+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":310,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 321 |
+
{"time":"2026-05-03T13:20:34.350997003+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 322 |
+
{"time":"2026-05-03T13:20:48.624959899+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":30,"history_lines":1,"events_offset":312,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 323 |
+
{"time":"2026-05-03T13:20:53.800045176+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 324 |
+
{"time":"2026-05-03T13:21:03.624629627+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":314,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 325 |
+
{"time":"2026-05-03T13:21:04.347251759+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 326 |
+
{"time":"2026-05-03T13:21:18.624838853+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":316,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 327 |
+
{"time":"2026-05-03T13:21:19.19502873+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 328 |
+
{"time":"2026-05-03T13:21:33.624780545+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":318,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 329 |
+
{"time":"2026-05-03T13:21:34.146731155+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 330 |
+
{"time":"2026-05-03T13:21:48.623883165+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":320,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 331 |
+
{"time":"2026-05-03T13:21:53.802198181+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 332 |
+
{"time":"2026-05-03T13:22:03.623771205+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":31,"history_lines":1,"events_offset":322,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 333 |
+
{"time":"2026-05-03T13:22:04.25178541+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 334 |
+
{"time":"2026-05-03T13:22:18.62418141+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":324,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 335 |
+
{"time":"2026-05-03T13:22:19.203144297+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 336 |
+
{"time":"2026-05-03T13:22:33.624683769+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":32,"history_lines":1,"events_offset":326,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 337 |
+
{"time":"2026-05-03T13:22:34.152272658+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 338 |
+
{"time":"2026-05-03T13:22:48.624033114+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":328,"events_lines":2,"console_offset":98,"console_lines":1}
|
| 339 |
+
{"time":"2026-05-03T13:22:52.585121776+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 340 |
+
{"time":"2026-05-03T13:23:03.624122754+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":33,"history_lines":1,"events_offset":330,"events_lines":2,"console_offset":98,"console_lines":6}
|
| 341 |
+
{"time":"2026-05-03T13:23:04.258539089+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 342 |
+
{"time":"2026-05-03T13:23:18.624268783+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":34,"history_lines":1,"events_offset":332,"events_lines":2,"console_offset":98,"console_lines":2}
|
| 343 |
+
{"time":"2026-05-03T13:23:19.105851896+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 344 |
+
{"time":"2026-05-03T13:23:33.624268784+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":334,"events_lines":2,"console_offset":104,"console_lines":7}
|
| 345 |
+
{"time":"2026-05-03T13:23:34.158773733+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 346 |
+
{"time":"2026-05-03T13:23:48.624502957+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":336,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 347 |
+
{"time":"2026-05-03T13:23:52.488571215+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 348 |
+
{"time":"2026-05-03T13:24:03.624346324+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":35,"history_lines":1,"events_offset":338,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 349 |
+
{"time":"2026-05-03T13:24:04.093442826+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 350 |
+
{"time":"2026-05-03T13:24:18.623881299+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":340,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 351 |
+
{"time":"2026-05-03T13:24:19.215891895+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 352 |
+
{"time":"2026-05-03T13:24:33.624223866+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":36,"history_lines":1,"events_offset":342,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 353 |
+
{"time":"2026-05-03T13:24:34.165786211+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 354 |
+
{"time":"2026-05-03T13:24:48.624101056+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":344,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 355 |
+
{"time":"2026-05-03T13:24:52.418222751+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 356 |
+
{"time":"2026-05-03T13:25:03.624050062+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":37,"history_lines":1,"events_offset":346,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 357 |
+
{"time":"2026-05-03T13:25:04.077447927+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 358 |
+
{"time":"2026-05-03T13:25:18.623724289+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":348,"events_lines":2,"console_offset":109,"console_lines":1}
|
| 359 |
+
{"time":"2026-05-03T13:25:19.223123465+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 360 |
+
{"time":"2026-05-03T13:25:33.623791077+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":38,"history_lines":1,"events_offset":350,"events_lines":2,"console_offset":109,"console_lines":6}
|
| 361 |
+
{"time":"2026-05-03T13:25:34.070406215+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 362 |
+
{"time":"2026-05-03T13:25:48.623830574+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":39,"history_lines":1,"events_offset":352,"events_lines":2,"console_offset":109,"console_lines":2}
|
| 363 |
+
{"time":"2026-05-03T13:25:52.510007264+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 364 |
+
{"time":"2026-05-03T13:26:03.623789151+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":354,"events_lines":2,"console_offset":115,"console_lines":7}
|
| 365 |
+
{"time":"2026-05-03T13:26:04.176987671+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 366 |
+
{"time":"2026-05-03T13:26:18.624463038+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":40,"history_lines":1,"events_offset":356,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 367 |
+
{"time":"2026-05-03T13:26:19.126789241+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 368 |
+
{"time":"2026-05-03T13:26:33.623945558+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":358,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 369 |
+
{"time":"2026-05-03T13:26:34.078975811+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 370 |
+
{"time":"2026-05-03T13:26:48.62452134+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":41,"history_lines":1,"events_offset":360,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 371 |
+
{"time":"2026-05-03T13:26:54.048538082+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 372 |
+
{"time":"2026-05-03T13:27:03.623969961+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":362,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 373 |
+
{"time":"2026-05-03T13:27:04.006756603+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 374 |
+
{"time":"2026-05-03T13:27:18.6247665+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":364,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 375 |
+
{"time":"2026-05-03T13:27:19.032623956+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 376 |
+
{"time":"2026-05-03T13:27:33.624465658+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":42,"history_lines":1,"events_offset":366,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 377 |
+
{"time":"2026-05-03T13:27:34.195667306+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 378 |
+
{"time":"2026-05-03T13:27:48.624979333+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":368,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 379 |
+
{"time":"2026-05-03T13:27:52.618971883+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 380 |
+
{"time":"2026-05-03T13:28:03.623830561+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":370,"events_lines":2,"console_offset":120,"console_lines":1}
|
| 381 |
+
{"time":"2026-05-03T13:28:04.089199692+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 382 |
+
{"time":"2026-05-03T13:28:18.623857839+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":43,"history_lines":1,"events_offset":372,"events_lines":2,"console_offset":120,"console_lines":6}
|
| 383 |
+
{"time":"2026-05-03T13:28:19.013505636+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 384 |
+
{"time":"2026-05-03T13:28:33.623790758+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":44,"history_lines":1,"events_offset":374,"events_lines":2,"console_offset":120,"console_lines":2}
|
| 385 |
+
{"time":"2026-05-03T13:28:34.177926015+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 386 |
+
{"time":"2026-05-03T13:28:48.624002804+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":376,"events_lines":2,"console_offset":126,"console_lines":7}
|
| 387 |
+
{"time":"2026-05-03T13:28:53.754090339+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 388 |
+
{"time":"2026-05-03T13:29:03.624437751+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":378,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 389 |
+
{"time":"2026-05-03T13:29:04.204898109+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 390 |
+
{"time":"2026-05-03T13:29:18.624210007+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":45,"history_lines":1,"events_offset":380,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 391 |
+
{"time":"2026-05-03T13:29:19.046606716+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 392 |
+
{"time":"2026-05-03T13:29:33.624220653+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":382,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 393 |
+
{"time":"2026-05-03T13:29:34.202729749+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 394 |
+
{"time":"2026-05-03T13:29:48.62385034+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":46,"history_lines":1,"events_offset":384,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 395 |
+
{"time":"2026-05-03T13:29:53.862105869+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 396 |
+
{"time":"2026-05-03T13:30:03.624007753+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":386,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 397 |
+
{"time":"2026-05-03T13:30:04.103026989+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 398 |
+
{"time":"2026-05-03T13:30:18.624599295+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":47,"history_lines":1,"events_offset":388,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 399 |
+
{"time":"2026-05-03T13:30:19.052388549+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 400 |
+
{"time":"2026-05-03T13:30:33.623750158+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":390,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 401 |
+
{"time":"2026-05-03T13:30:34.105389648+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 402 |
+
{"time":"2026-05-03T13:30:48.625066437+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":392,"events_lines":2,"console_offset":131,"console_lines":1}
|
| 403 |
+
{"time":"2026-05-03T13:30:53.76766617+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 404 |
+
{"time":"2026-05-03T13:31:03.623820021+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":48,"history_lines":1,"events_offset":394,"events_lines":2,"console_offset":131,"console_lines":6}
|
| 405 |
+
{"time":"2026-05-03T13:31:04.212136382+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 406 |
+
{"time":"2026-05-03T13:31:18.624132197+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":49,"history_lines":1,"events_offset":396,"events_lines":2,"console_offset":131,"console_lines":2}
|
| 407 |
+
{"time":"2026-05-03T13:31:19.060752483+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 408 |
+
{"time":"2026-05-03T13:31:33.623935802+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":398,"events_lines":2,"console_offset":137,"console_lines":7}
|
| 409 |
+
{"time":"2026-05-03T13:31:34.113303634+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 410 |
+
{"time":"2026-05-03T13:31:48.624087183+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":50,"history_lines":1,"events_offset":400,"events_lines":2,"console_offset":142,"console_lines":1}
|
| 411 |
+
{"time":"2026-05-03T13:31:53.77424155+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 412 |
+
{"time":"2026-05-03T13:32:03.624427221+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":402,"events_lines":2,"console_offset":142,"console_lines":1}
|
| 413 |
+
{"time":"2026-05-03T13:32:04.116482624+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 414 |
+
{"time":"2026-05-03T13:32:18.624173217+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":51,"history_lines":1,"events_offset":404,"events_lines":2,"console_offset":142,"console_lines":1}
|
| 415 |
+
{"time":"2026-05-03T13:32:19.068688911+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 416 |
+
{"time":"2026-05-03T13:32:33.624454974+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":406,"events_lines":2,"console_offset":142,"console_lines":1}
|
| 417 |
+
{"time":"2026-05-03T13:32:34.120032583+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 418 |
+
{"time":"2026-05-03T13:32:48.624562713+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":52,"history_lines":1,"events_offset":408,"events_lines":2,"console_offset":142,"console_lines":1}
|
| 419 |
+
{"time":"2026-05-03T13:32:53.790901488+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 420 |
+
{"time":"2026-05-03T13:33:03.623780704+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":410,"events_lines":2,"console_offset":142,"console_lines":1}
|
| 421 |
+
{"time":"2026-05-03T13:33:04.227727807+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 422 |
+
{"time":"2026-05-03T13:33:18.62449669+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":53,"history_lines":1,"events_offset":412,"events_lines":2,"console_offset":142,"console_lines":6}
|
| 423 |
+
{"time":"2026-05-03T13:33:19.074032897+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 424 |
+
{"time":"2026-05-03T13:33:33.623647814+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":54,"history_lines":1,"events_offset":414,"events_lines":2,"console_offset":142,"console_lines":2}
|
| 425 |
+
{"time":"2026-05-03T13:33:34.538702583+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 426 |
+
{"time":"2026-05-03T13:33:48.623834252+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":416,"events_lines":2,"console_offset":148,"console_lines":7}
|
| 427 |
+
{"time":"2026-05-03T13:33:53.786642254+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 428 |
+
{"time":"2026-05-03T13:34:03.623944615+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":418,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 429 |
+
{"time":"2026-05-03T13:34:04.542055301+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 430 |
+
{"time":"2026-05-03T13:34:18.624315311+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":55,"history_lines":1,"events_offset":420,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 431 |
+
{"time":"2026-05-03T13:34:19.087548151+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 432 |
+
{"time":"2026-05-03T13:34:33.624071687+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":422,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 433 |
+
{"time":"2026-05-03T13:34:34.662667927+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 434 |
+
{"time":"2026-05-03T13:34:48.62427173+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":56,"history_lines":1,"events_offset":424,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 435 |
+
{"time":"2026-05-03T13:34:50.512616498+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 436 |
+
{"time":"2026-05-03T13:35:03.624076367+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":426,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 437 |
+
{"time":"2026-05-03T13:35:04.44636868+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 438 |
+
{"time":"2026-05-03T13:35:18.624696413+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":428,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 439 |
+
{"time":"2026-05-03T13:35:19.11344712+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 440 |
+
{"time":"2026-05-03T13:35:33.624072402+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":430,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 441 |
+
{"time":"2026-05-03T13:35:34.143662028+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 442 |
+
{"time":"2026-05-03T13:35:48.62468305+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":57,"history_lines":1,"events_offset":432,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 443 |
+
{"time":"2026-05-03T13:35:53.802858739+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 444 |
+
{"time":"2026-05-03T13:36:03.624546268+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":434,"events_lines":2,"console_offset":153,"console_lines":1}
|
| 445 |
+
{"time":"2026-05-03T13:36:04.14406513+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 446 |
+
{"time":"2026-05-03T13:36:18.624111624+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":58,"history_lines":1,"events_offset":436,"events_lines":2,"console_offset":153,"console_lines":6}
|
| 447 |
+
{"time":"2026-05-03T13:36:19.096099413+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 448 |
+
{"time":"2026-05-03T13:36:33.624290887+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":59,"history_lines":1,"events_offset":438,"events_lines":2,"console_offset":153,"console_lines":2}
|
| 449 |
+
{"time":"2026-05-03T13:36:34.257147799+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 450 |
+
{"time":"2026-05-03T13:36:48.624306903+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":440,"events_lines":2,"console_offset":159,"console_lines":7}
|
| 451 |
+
{"time":"2026-05-03T13:36:52.478397774+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 452 |
+
{"time":"2026-05-03T13:37:03.623946629+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":60,"history_lines":1,"events_offset":442,"events_lines":2,"console_offset":164,"console_lines":1}
|
| 453 |
+
{"time":"2026-05-03T13:37:04.151079888+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 454 |
+
{"time":"2026-05-03T13:37:18.624500991+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":444,"events_lines":2,"console_offset":164,"console_lines":1}
|
| 455 |
+
{"time":"2026-05-03T13:37:19.204796067+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 456 |
+
{"time":"2026-05-03T13:37:33.624178719+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":61,"history_lines":1,"events_offset":446,"events_lines":2,"console_offset":164,"console_lines":1}
|
| 457 |
+
{"time":"2026-05-03T13:37:34.256997889+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 458 |
+
{"time":"2026-05-03T13:37:48.624482097+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":448,"events_lines":2,"console_offset":164,"console_lines":1}
|
| 459 |
+
{"time":"2026-05-03T13:37:53.920905508+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 460 |
+
{"time":"2026-05-03T13:38:03.624627196+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":62,"history_lines":1,"events_offset":450,"events_lines":2,"console_offset":164,"console_lines":1}
|
| 461 |
+
{"time":"2026-05-03T13:38:04.158192542+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 462 |
+
{"time":"2026-05-03T13:38:18.623873971+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":452,"events_lines":2,"console_offset":164,"console_lines":1}
|
| 463 |
+
{"time":"2026-05-03T13:38:19.10989055+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 464 |
+
{"time":"2026-05-03T13:38:33.624647717+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":63,"history_lines":1,"events_offset":454,"events_lines":2,"console_offset":164,"console_lines":6}
|
| 465 |
+
{"time":"2026-05-03T13:38:34.06006327+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 466 |
+
{"time":"2026-05-03T13:38:48.624391972+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":64,"history_lines":1,"events_offset":456,"events_lines":2,"console_offset":164,"console_lines":2}
|
| 467 |
+
{"time":"2026-05-03T13:38:52.511278211+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 468 |
+
{"time":"2026-05-03T13:39:03.62468486+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":458,"events_lines":2,"console_offset":170,"console_lines":7}
|
| 469 |
+
{"time":"2026-05-03T13:39:04.063896486+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 470 |
+
{"time":"2026-05-03T13:39:18.625250663+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":460,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 471 |
+
{"time":"2026-05-03T13:39:19.117425843+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 472 |
+
{"time":"2026-05-03T13:39:33.62476815+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":462,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 473 |
+
{"time":"2026-05-03T13:39:34.17127823+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 474 |
+
{"time":"2026-05-03T13:39:48.62534124+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":464,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 475 |
+
{"time":"2026-05-03T13:39:53.832331787+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 476 |
+
{"time":"2026-05-03T13:40:03.624605738+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":466,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 477 |
+
{"time":"2026-05-03T13:40:04.284207372+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 478 |
+
{"time":"2026-05-03T13:40:18.625015777+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":65,"history_lines":1,"events_offset":468,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 479 |
+
{"time":"2026-05-03T13:40:19.328978383+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 480 |
+
{"time":"2026-05-03T13:40:33.624370445+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":470,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 481 |
+
{"time":"2026-05-03T13:40:34.382233692+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 482 |
+
{"time":"2026-05-03T13:40:48.625233991+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":472,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 483 |
+
{"time":"2026-05-03T13:40:52.447473984+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 484 |
+
{"time":"2026-05-03T13:41:03.624167966+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":474,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 485 |
+
{"time":"2026-05-03T13:41:04.079693644+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 486 |
+
{"time":"2026-05-03T13:41:18.624191436+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":476,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 487 |
+
{"time":"2026-05-03T13:41:19.129757502+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 488 |
+
{"time":"2026-05-03T13:41:33.624909006+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":478,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 489 |
+
{"time":"2026-05-03T13:41:34.184623991+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 490 |
+
{"time":"2026-05-03T13:41:48.624963284+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":66,"history_lines":1,"events_offset":480,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 491 |
+
{"time":"2026-05-03T13:41:52.719060595+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 492 |
+
{"time":"2026-05-03T13:42:03.625876072+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":482,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 493 |
+
{"time":"2026-05-03T13:42:04.187434496+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 494 |
+
{"time":"2026-05-03T13:42:18.626563405+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":484,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 495 |
+
{"time":"2026-05-03T13:42:19.137988786+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 496 |
+
{"time":"2026-05-03T13:42:33.624934109+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":486,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 497 |
+
{"time":"2026-05-03T13:42:34.204316513+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 498 |
+
{"time":"2026-05-03T13:42:48.624833123+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":488,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 499 |
+
{"time":"2026-05-03T13:42:52.521199735+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 500 |
+
{"time":"2026-05-03T13:43:03.624834571+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":67,"history_lines":1,"events_offset":490,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 501 |
+
{"time":"2026-05-03T13:43:04.19529391+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 502 |
+
{"time":"2026-05-03T13:43:18.624997373+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":492,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 503 |
+
{"time":"2026-05-03T13:43:19.145068366+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 504 |
+
{"time":"2026-05-03T13:43:33.624157195+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":494,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 505 |
+
{"time":"2026-05-03T13:43:34.20597247+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 506 |
+
{"time":"2026-05-03T13:43:48.624978949+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":496,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 507 |
+
{"time":"2026-05-03T13:43:53.858854775+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 508 |
+
{"time":"2026-05-03T13:44:03.623885144+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":498,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 509 |
+
{"time":"2026-05-03T13:44:04.203947716+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 510 |
+
{"time":"2026-05-03T13:44:18.62381564+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":500,"events_lines":2,"console_offset":175,"console_lines":1}
|
| 511 |
+
{"time":"2026-05-03T13:44:19.052290808+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 512 |
+
{"time":"2026-05-03T13:44:33.624777738+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":68,"history_lines":1,"events_offset":502,"events_lines":2,"console_offset":175,"console_lines":6}
|
| 513 |
+
{"time":"2026-05-03T13:44:34.104649872+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 514 |
+
{"time":"2026-05-03T13:44:48.625801712+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":69,"history_lines":1,"events_offset":504,"events_lines":2,"console_offset":175,"console_lines":2}
|
| 515 |
+
{"time":"2026-05-03T13:44:52.535011903+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 516 |
+
{"time":"2026-05-03T13:45:03.624781826+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":506,"events_lines":2,"console_offset":181,"console_lines":7}
|
| 517 |
+
{"time":"2026-05-03T13:45:06.161240364+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 518 |
+
{"time":"2026-05-03T13:45:18.624558731+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":508,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 519 |
+
{"time":"2026-05-03T13:45:19.159052248+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 520 |
+
{"time":"2026-05-03T13:45:33.624946738+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":510,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 521 |
+
{"time":"2026-05-03T13:45:34.109361541+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 522 |
+
{"time":"2026-05-03T13:45:48.623981354+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":512,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 523 |
+
{"time":"2026-05-03T13:45:53.77168815+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 524 |
+
{"time":"2026-05-03T13:46:03.624285248+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":70,"history_lines":1,"events_offset":514,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 525 |
+
{"time":"2026-05-03T13:46:04.21537653+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 526 |
+
{"time":"2026-05-03T13:46:18.624800855+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":516,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 527 |
+
{"time":"2026-05-03T13:46:19.063386972+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 528 |
+
{"time":"2026-05-03T13:46:33.624648964+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":518,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 529 |
+
{"time":"2026-05-03T13:46:34.118713729+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 530 |
+
{"time":"2026-05-03T13:46:48.624555173+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":520,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 531 |
+
{"time":"2026-05-03T13:46:52.650911087+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 532 |
+
{"time":"2026-05-03T13:47:03.624449509+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":522,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 533 |
+
{"time":"2026-05-03T13:47:04.222356182+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 534 |
+
{"time":"2026-05-03T13:47:18.625951064+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":71,"history_lines":1,"events_offset":524,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 535 |
+
{"time":"2026-05-03T13:47:19.071276729+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 536 |
+
{"time":"2026-05-03T13:47:33.624367816+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":526,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 537 |
+
{"time":"2026-05-03T13:47:34.124782785+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 538 |
+
{"time":"2026-05-03T13:47:48.624182215+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":528,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 539 |
+
{"time":"2026-05-03T13:47:53.784893271+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 540 |
+
{"time":"2026-05-03T13:48:03.624971356+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":530,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 541 |
+
{"time":"2026-05-03T13:48:04.238975318+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 542 |
+
{"time":"2026-05-03T13:48:18.623911134+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":532,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 543 |
+
{"time":"2026-05-03T13:48:19.078391518+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 544 |
+
{"time":"2026-05-03T13:48:33.624018702+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":534,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 545 |
+
{"time":"2026-05-03T13:48:34.188258053+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 546 |
+
{"time":"2026-05-03T13:48:48.624560208+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":72,"history_lines":1,"events_offset":536,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 547 |
+
{"time":"2026-05-03T13:48:54.918684542+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 548 |
+
{"time":"2026-05-03T13:49:03.624855319+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":538,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 549 |
+
{"time":"2026-05-03T13:49:04.128716752+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 550 |
+
{"time":"2026-05-03T13:49:18.624421818+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":540,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 551 |
+
{"time":"2026-05-03T13:49:19.394206735+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 552 |
+
{"time":"2026-05-03T13:49:33.624689788+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":542,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 553 |
+
{"time":"2026-05-03T13:49:34.136619823+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 554 |
+
{"time":"2026-05-03T13:49:48.624641418+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":544,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 555 |
+
{"time":"2026-05-03T13:49:52.571726058+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 556 |
+
{"time":"2026-05-03T13:50:03.624656992+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":546,"events_lines":2,"console_offset":186,"console_lines":1}
|
| 557 |
+
{"time":"2026-05-03T13:50:04.109888074+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 558 |
+
{"time":"2026-05-03T13:50:18.625313855+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":73,"history_lines":1,"events_offset":548,"events_lines":2,"console_offset":186,"console_lines":6}
|
| 559 |
+
{"time":"2026-05-03T13:50:19.195411994+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 560 |
+
{"time":"2026-05-03T13:50:33.624802417+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":550,"events_lines":2,"console_offset":187,"console_lines":1}
|
| 561 |
+
{"time":"2026-05-03T13:50:34.049371004+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 562 |
+
{"time":"2026-05-03T13:50:48.624139142+05:30","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":74,"history_lines":1,"events_offset":552,"events_lines":2,"console_offset":186,"console_lines":2}
|
| 563 |
+
{"time":"2026-05-03T13:50:52.574063905+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 564 |
+
{"time":"2026-05-03T13:51:03.625454368+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":554,"events_lines":2,"console_offset":192,"console_lines":7}
|
| 565 |
+
{"time":"2026-05-03T13:51:04.045280089+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 566 |
+
{"time":"2026-05-03T13:51:18.624804269+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":556,"events_lines":2,"console_offset":197,"console_lines":1}
|
| 567 |
+
{"time":"2026-05-03T13:51:19.059234035+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 568 |
+
{"time":"2026-05-03T13:51:33.62451164+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":558,"events_lines":2,"console_offset":197,"console_lines":1}
|
| 569 |
+
{"time":"2026-05-03T13:51:34.050683028+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 570 |
+
{"time":"2026-05-03T13:51:48.624511766+05:30","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":560,"events_lines":2,"console_offset":197,"console_lines":1}
|
| 571 |
+
{"time":"2026-05-03T13:51:52.584237864+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 572 |
+
{"time":"2026-05-03T13:52:03.453661092+05:30","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 573 |
+
{"time":"2026-05-03T13:52:03.455774675+05:30","level":"INFO","msg":"filestream: sending request","total_files":3,"history_offset":75,"history_lines":1,"console_offset":197,"console_lines":37,"uploaded_len":3,"complete":true,"exit_code":1}
|
| 574 |
+
{"time":"2026-05-03T13:52:03.938466542+05:30","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
|
| 575 |
+
{"time":"2026-05-03T13:52:03.939642046+05:30","level":"INFO","msg":"stream: finishing up"}
|
| 576 |
+
{"time":"2026-05-03T13:52:03.940171233+05:30","level":"INFO","msg":"handler: closed"}
|
| 577 |
+
{"time":"2026-05-03T13:52:03.943488073+05:30","level":"INFO","msg":"sender: closed"}
|
| 578 |
+
{"time":"2026-05-03T13:52:03.943913795+05:30","level":"INFO","msg":"stream: all finished"}
|
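The `filestream: sending request` / `filestream: request sent` pairs above are the wandb service flushing buffered history, events, and console lines to the server on a roughly 15-second cadence; the final entry with `"complete":true,"exit_code":1` marks the run finishing with a non-zero exit status. As a rough way to inspect this cadence offline, a minimal sketch that pairs each send with the following acknowledgement (the helper names and default path are illustrative, not part of the wandb API; it assumes one JSON object per line, as in the log above):

```python
import json
from datetime import datetime

def _parse_ts(s: str) -> datetime:
    # The log stamps nanoseconds (e.g. 13:42:33.624934109+05:30); trim the
    # fractional part to microseconds so datetime.fromisoformat accepts it.
    body, tz = s[:-6], s[-6:]          # split off the "+05:30" offset
    if "." in body:
        sec, frac = body.split(".")
        body = f"{sec}.{frac[:6]}"
    return datetime.fromisoformat(body + tz)

def filestream_latencies(path: str = "wandb/debug-internal.log"):
    """Pair each 'sending request' entry with the next 'request sent' entry."""
    pending = None
    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            try:
                rec = json.loads(raw)
            except json.JSONDecodeError:
                continue  # ignore any non-JSON lines
            ts = _parse_ts(rec["time"])
            if rec.get("msg") == "filestream: sending request":
                pending = ts
            elif rec.get("msg") == "filestream: request sent" and pending:
                yield ts, (ts - pending).total_seconds()
                pending = None

for sent_at, secs in filestream_latencies():
    print(f"{sent_at.isoformat()}  round-trip {secs:.3f}s")
```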
wandb/debug.log
ADDED
@@ -0,0 +1,24 @@
+2026-05-03 12:41:31,519 INFO MainThread:34388 [wandb_setup.py:_flush():81] Current SDK version is 0.26.1
+2026-05-03 12:41:31,519 INFO MainThread:34388 [wandb_setup.py:_flush():81] Configure stats pid to 34388
+2026-05-03 12:41:31,520 INFO MainThread:34388 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-05-03 12:41:31,520 INFO MainThread:34388 [wandb_init.py:setup_run_log_directory():723] Logging user logs to /run/media/morpheuslord/Personal_Files/Projects/Rewriter/wandb/run-20260503_124131-7q4dwe22/logs/debug.log
+2026-05-03 12:41:31,520 INFO MainThread:34388 [wandb_init.py:setup_run_log_directory():724] Logging internal logs to /run/media/morpheuslord/Personal_Files/Projects/Rewriter/wandb/run-20260503_124131-7q4dwe22/logs/debug-internal.log
+2026-05-03 12:41:31,520 INFO MainThread:34388 [wandb_init.py:init():850] calling init triggers
+2026-05-03 12:41:31,520 INFO MainThread:34388 [wandb_init.py:init():855] wandb.init called with sweep_config: {}
+config: {'model': {'key': 'flan-t5-small', 'quantize': False, 'use_lora': True}, 'lora': {'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'target_modules': ['q', 'v', 'k', 'o', 'wi_0', 'wi_1', 'wo']}, 'data': {'train_path': 'data/processed/train.jsonl', 'val_path': 'data/processed/val.jsonl', 'test_path': 'data/processed/test.jsonl', 'max_input_length': 128, 'max_target_length': 128, 'augment_synthetic': True, 'synthetic_ratio': 0.3}, 'training': {'output_dir': 'checkpoints/', 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 8, 'learning_rate': 0.0003, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.05, 'weight_decay': 0.01, 'fp16': False, 'bf16': True, 'evaluation_strategy': 'steps', 'eval_steps': 100, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 3, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'logging_dir': 'logs/', 'logging_steps': 25, 'report_to': ['wandb', 'tensorboard'], 'dataloader_num_workers': 0, 'seed': 42, 'push_to_hub': False}, 'loss': {'lambda_style': 0.3, 'lambda_semantic': 0.5, 'lambda_human_pattern': 0.4, 'sem_model_name': 'all-mpnet-base-v2'}, 'generation': {'num_beams': 5, 'length_penalty': 1.0, 'no_repeat_ngram_size': 3, 'min_length': 10, 'max_new_tokens': 512, 'early_stopping': True}, 'human_pattern': {'classifier_path': 'checkpoints/human_pattern_classifier.pt', 'shanegerami_path': 'data/raw/shanegerami/AI_Human.csv', 'starblasters_path': 'data/raw/starblasters8/data.parquet', 'max_samples_per_source': 50000, 'pretrain_epochs': 20, 'pretrain_lr': 0.001, 'pretrain_batch_size': 512, 'target_auc': 0.88}, '_wandb': {}}
+2026-05-03 12:41:31,520 INFO MainThread:34388 [wandb_init.py:init():898] starting backend
+2026-05-03 12:41:31,902 INFO MainThread:34388 [wandb_init.py:init():913] sending inform_init request
+2026-05-03 12:41:32,641 INFO MainThread:34388 [wandb_init.py:init():918] backend started and connected
+2026-05-03 12:41:32,643 INFO MainThread:34388 [wandb_init.py:init():988] updated telemetry
+2026-05-03 12:41:32,644 INFO MainThread:34388 [wandb_init.py:init():1011] communicating run to backend with 90.0 second timeout
+2026-05-03 12:41:33,463 INFO MainThread:34388 [wandb_init.py:init():1056] starting run threads in backend
+2026-05-03 12:41:33,614 INFO MainThread:34388 [wandb_run.py:_console_start():2554] atexit reg
+2026-05-03 12:41:33,614 INFO MainThread:34388 [wandb_run.py:_redirect():2403] redirect: wrap_raw
+2026-05-03 12:41:33,614 INFO MainThread:34388 [wandb_run.py:_redirect():2472] Wrapping output streams.
+2026-05-03 12:41:33,614 INFO MainThread:34388 [wandb_run.py:_redirect():2495] Redirects installed.
+2026-05-03 12:41:33,616 INFO MainThread:34388 [wandb_init.py:init():1094] run started, returning control to user process
+2026-05-03 12:41:39,987 INFO MainThread:34388 [wandb_run.py:_config_callback():1415] config_cb None None {'peft_config': {'default': {'task_type': 'SEQ_2_SEQ_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.19.1', 'base_model_name_or_path': 'google/flan-t5-small', 'revision': None, 'inference_mode': False, 'r': 8, 'target_modules': ['wo', 'wi_0', 'v', 'q', 'k', 'o', 'wi_1'], 'exclude_modules': None, 'lora_alpha': 16, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'lora_ga_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'use_bdlora': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 32128, 'd_model': 512, 'd_kv': 64, 'd_ff': 1024, 'num_layers': 8, 'num_decoder_layers': 8, 'num_heads': 6, 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'dropout_rate': 0.1, 'classifier_dropout': 0.0, 'layer_norm_epsilon': 1e-06, 'initializer_factor': 1.0, 'feed_forward_proj': 'gated-gelu', 'use_cache': True, 'dense_act_fn': 'gelu_new', 'is_gated_act': True, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': True, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['T5ForConditionalGeneration'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 0, 'eos_token_id': 1, 'sep_token_id': None, 'decoder_start_token_id': 0, 'task_specific_params': {'summarization': {'early_stopping': True, 'length_penalty': 2.0, 'max_length': 200, 'min_length': 30, 'no_repeat_ngram_size': 3, 'num_beams': 4, 'prefix': 'summarize: '}, 'translation_en_to_de': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to German: '}, 'translation_en_to_fr': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to French: '}, 'translation_en_to_ro': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to Romanian: '}}, 'problem_type': None, '_name_or_path': 'google/flan-t5-small', 'transformers_version': '4.53.2', 'model_type': 't5', 'n_positions': 512, 'output_past': True, 'output_attentions': False, 'output_dir': 'checkpoints/', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0003, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.05, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'logs/', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 3, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'checkpoints/', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb', 'tensorboard'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
+2026-05-03 12:41:39,991 INFO MainThread:34388 [wandb_config.py:__setitem__():155] [no run ID] config set model/num_parameters = 78239104 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7f6f34fb9e80>>
+2026-05-03 12:41:39,992 INFO MainThread:34388 [wandb_run.py:_config_callback():1415] config_cb model/num_parameters 78239104 None
+2026-05-03 13:52:01,572 INFO wandb-AsyncioManager-main:34388 [service_client.py:_forward_responses():134] Reached EOF.
+2026-05-03 13:52:01,575 INFO wandb-AsyncioManager-main:34388 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles.
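The `config_cb` entry above dumps the merged model, PEFT, and TrainingArguments config that wandb captured, and the following entries record 78,239,104 total parameters. For orientation, here is a minimal sketch of the adapter setup those logged values imply: `google/flan-t5-small` wrapped with a rank-8 LoRA over `q/k/v/o/wi_0/wi_1/wo`. It assumes current `transformers` and `peft` releases; the project name is illustrative, and this is not the repository's actual training entry point, only a reconstruction from the logged config:

```python
import wandb
from transformers import AutoModelForSeq2SeqLM
from peft import LoraConfig, TaskType, get_peft_model

# Values copied from the logged peft_config: r=8, alpha=16, dropout=0.05.
lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q", "v", "k", "o", "wi_0", "wi_1", "wo"],
)

base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()  # the log above records 78,239,104 total params

# Passing the nested dict mirrors the 'config:' entry in debug.log.
run = wandb.init(project="rewriter-demo", config={
    "model": {"key": "flan-t5-small", "use_lora": True},
    "lora": {"r": 8, "lora_alpha": 16, "lora_dropout": 0.05},
})
```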
wandb/run-20260502_165105-pwnhqrrf/run-pwnhqrrf.wandb
ADDED
Binary file (11.5 kB)

wandb/run-20260502_165541-4d797dih/run-4d797dih.wandb
ADDED
Binary file (6.88 kB)

wandb/run-20260502_201947-ngpyijum/run-ngpyijum.wandb
ADDED
Binary file (30.8 kB)

wandb/run-20260502_202439-7n7pnref/run-7n7pnref.wandb
ADDED
Binary file (26.2 kB)

wandb/run-20260502_203519-fib23yhh/run-fib23yhh.wandb
ADDED
Binary file (32.8 kB)

wandb/run-20260502_210534-j0t4q38m/run-j0t4q38m.wandb
ADDED
Binary file (45.4 kB)

wandb/run-20260502_212127-vl8pftkj/run-vl8pftkj.wandb
ADDED
Binary file (65.8 kB)

wandb/run-20260502_213822-mmm9bdu9/run-mmm9bdu9.wandb
ADDED
Binary file (54.7 kB)

wandb/run-20260503_120130-xzkygl93/run-xzkygl93.wandb
ADDED
Binary file (4.92 kB)

wandb/run-20260503_120403-cbb6slr5/run-cbb6slr5.wandb
ADDED
Binary file (64.9 kB)