morpheuslord committed on
Commit 12fd5f2 · verified · 1 Parent(s): 9dd64b9

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
Files changed (50)
  1. scripts/__pycache__/evaluate.cpython-312.pyc +0 -0
  2. src/__pycache__/__init__.cpython-312.pyc +0 -0
  3. src/__pycache__/__init__.cpython-314.pyc +0 -0
  4. src/api/__init__.py +0 -0
  5. src/api/__pycache__/main.cpython-312.pyc +0 -0
  6. src/api/__pycache__/middleware.cpython-312.pyc +0 -0
  7. src/api/__pycache__/schemas.cpython-312.pyc +0 -0
  8. src/api/middleware.py +67 -0
  9. src/api/schemas.py +21 -0
  10. src/evaluation/__init__.py +0 -0
  11. src/evaluation/__pycache__/__init__.cpython-314.pyc +0 -0
  12. src/evaluation/__pycache__/authorship_verifier.cpython-312.pyc +0 -0
  13. src/evaluation/__pycache__/errant_evaluator.cpython-312.pyc +0 -0
  14. src/evaluation/__pycache__/gleu_scorer.cpython-312.pyc +0 -0
  15. src/evaluation/__pycache__/gleu_scorer.cpython-314.pyc +0 -0
  16. src/evaluation/__pycache__/style_metrics.cpython-312.pyc +0 -0
  17. src/evaluation/__pycache__/style_metrics.cpython-314.pyc +0 -0
  18. src/evaluation/authorship_verifier.py +50 -0
  19. src/evaluation/errant_evaluator.py +82 -0
  20. src/evaluation/gleu_scorer.py +68 -0
  21. src/evaluation/style_metrics.py +81 -0
  22. src/inference/__init__.py +0 -0
  23. src/inference/__pycache__/__init__.cpython-314.pyc +0 -0
  24. src/inference/__pycache__/corrector.cpython-312.pyc +0 -0
  25. src/inference/__pycache__/corrector.cpython-314.pyc +0 -0
  26. src/inference/__pycache__/postprocessor.cpython-312.pyc +0 -0
  27. src/inference/__pycache__/postprocessor.cpython-314.pyc +0 -0
  28. src/inference/corrector.py +283 -0
  29. src/inference/postprocessor.py +118 -0
  30. src/model/__init__.py +0 -0
  31. src/model/__pycache__/__init__.cpython-312.pyc +0 -0
  32. src/model/__pycache__/__init__.cpython-314.pyc +0 -0
  33. src/model/__pycache__/base_model.cpython-312.pyc +0 -0
  34. src/model/__pycache__/base_model.cpython-314.pyc +0 -0
  35. src/model/__pycache__/generation_utils.cpython-312.pyc +0 -0
  36. src/model/__pycache__/generation_utils.cpython-314.pyc +0 -0
  37. src/model/__pycache__/lora_adapter.cpython-312.pyc +0 -0
  38. src/model/__pycache__/style_conditioner.cpython-312.pyc +0 -0
  39. src/model/__pycache__/style_conditioner.cpython-314.pyc +0 -0
  40. src/model/base_model.py +135 -0
  41. src/model/generation_utils.py +106 -0
  42. src/model/lora_adapter.py +54 -0
  43. src/model/style_conditioner.py +74 -0
  44. src/preprocessing/__init__.py +0 -0
  45. src/preprocessing/__pycache__/__init__.cpython-312.pyc +0 -0
  46. src/preprocessing/__pycache__/__init__.cpython-314.pyc +0 -0
  47. src/preprocessing/__pycache__/dependency_parser.cpython-312.pyc +0 -0
  48. src/preprocessing/__pycache__/dyslexia_simulator.cpython-312.pyc +0 -0
  49. src/preprocessing/__pycache__/dyslexia_simulator.cpython-314.pyc +0 -0
  50. src/preprocessing/__pycache__/ner_tagger.cpython-312.pyc +0 -0
scripts/__pycache__/evaluate.cpython-312.pyc ADDED
Binary file (5.24 kB)

src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (169 Bytes)

src/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (171 Bytes)

src/api/__init__.py ADDED
File without changes
src/api/__pycache__/main.cpython-312.pyc ADDED
Binary file (3.59 kB)

src/api/__pycache__/middleware.cpython-312.pyc ADDED
Binary file (3.68 kB)

src/api/__pycache__/schemas.cpython-312.pyc ADDED
Binary file (1.38 kB)

src/api/middleware.py ADDED
@@ -0,0 +1,67 @@
+"""
+API middleware for request logging, rate limiting, and error handling.
+"""
+
+from fastapi import Request
+from fastapi.responses import JSONResponse
+from starlette.middleware.base import BaseHTTPMiddleware
+from loguru import logger
+import time
+from collections import defaultdict, deque
+
+
+class RequestLoggingMiddleware(BaseHTTPMiddleware):
+    """Logs all incoming requests with timing information."""
+
+    async def dispatch(self, request: Request, call_next):
+        start_time = time.time()
+        path = request.url.path
+        method = request.method
+
+        logger.info(f"→ {method} {path}")
+
+        try:
+            response = await call_next(request)
+        except Exception as e:
+            logger.error(f"✗ {method} {path} - Error: {e}")
+            raise
+
+        elapsed = (time.time() - start_time) * 1000  # ms
+        logger.info(f"← {method} {path} - {response.status_code} ({elapsed:.1f}ms)")
+
+        return response
+
+
+class RateLimitMiddleware(BaseHTTPMiddleware):
+    """Simple in-memory rate limiting."""
+
+    def __init__(self, app, max_requests_per_minute: int = 60):
+        super().__init__(app)
+        self.max_requests = max_requests_per_minute
+        self.window = 60  # seconds
+        # Track requests per client IP: {ip: deque([timestamp, ...])}
+        self.requests: dict = defaultdict(deque)
+
+    async def dispatch(self, request: Request, call_next):
+        # Get client IP
+        client_ip = request.client.host if request.client else "unknown"
+        now = time.time()
+
+        # Clean old entries
+        timestamps = self.requests[client_ip]
+        while timestamps and timestamps[0] < now - self.window:
+            timestamps.popleft()
+
+        # Check rate limit
+        if len(timestamps) >= self.max_requests:
+            logger.warning(f"Rate limited: {client_ip} ({len(timestamps)} requests in {self.window}s)")
+            return JSONResponse(
+                status_code=429,
+                content={"detail": "Rate limit exceeded. Please wait before making more requests."},
+            )
+
+        # Record this request
+        timestamps.append(now)
+
+        response = await call_next(request)
+        return response
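
Note: a minimal wiring sketch (not part of this commit) showing how these two middlewares would typically be registered. The FastAPI app object presumably lives in src/api/main.py, which appears here only as a compiled .pyc, so this is an assumption:

from fastapi import FastAPI
from src.api.middleware import RequestLoggingMiddleware, RateLimitMiddleware

app = FastAPI()
# Starlette runs the most recently added middleware outermost, so the
# rate limiter below rejects over-limit requests before they are logged.
app.add_middleware(RequestLoggingMiddleware)
app.add_middleware(RateLimitMiddleware, max_requests_per_minute=60)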
src/api/schemas.py ADDED
@@ -0,0 +1,21 @@
+"""
+Pydantic schemas for API request/response validation.
+"""
+
+from pydantic import BaseModel, Field
+from typing import Optional, Dict
+
+
+class CorrectionRequest(BaseModel):
+    text: str = Field(..., min_length=10, max_length=5000, description="Raw dyslectic text to correct.")
+    master_copy: Optional[str] = Field(None, description="Optional master copy to match style toward.")
+    style_alpha: float = Field(0.6, ge=0.0, le=1.0, description="Weight given to user's own style (0=full master, 1=full user).")
+
+
+class CorrectionResponse(BaseModel):
+    original: str
+    corrected: str
+    style_similarity: float
+    awl_coverage: float
+    readability: Dict[str, float]
+    changes_summary: str
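
Note: a usage sketch (field values invented) showing the validation these schemas enforce:

from src.api.schemas import CorrectionRequest

req = CorrectionRequest(
    text="Ths is a smaple sentance that neads correcting.",
    style_alpha=0.6,
)
# Pydantic raises ValidationError at construction time if text is shorter
# than 10 characters or longer than 5000, or style_alpha falls outside [0.0, 1.0].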
src/evaluation/__init__.py ADDED
File without changes
src/evaluation/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (182 Bytes)

src/evaluation/__pycache__/authorship_verifier.cpython-312.pyc ADDED
Binary file (2.6 kB)

src/evaluation/__pycache__/errant_evaluator.cpython-312.pyc ADDED
Binary file (3.63 kB)

src/evaluation/__pycache__/gleu_scorer.cpython-312.pyc ADDED
Binary file (2.42 kB)

src/evaluation/__pycache__/gleu_scorer.cpython-314.pyc ADDED
Binary file (3.02 kB)

src/evaluation/__pycache__/style_metrics.cpython-312.pyc ADDED
Binary file (4.34 kB)

src/evaluation/__pycache__/style_metrics.cpython-314.pyc ADDED
Binary file (5.25 kB)
 
src/evaluation/authorship_verifier.py ADDED
@@ -0,0 +1,50 @@
+"""
+Authorship verification module.
+Uses a fine-tuned model to verify whether the corrected output
+could plausibly have been written by the same author as the input.
+Target: > 0.80 same-author probability.
+"""
+
+from typing import Tuple
+from loguru import logger
+import torch
+import torch.nn.functional as F
+
+
+class AuthorshipVerifier:
+    """Verifies authorship consistency between input and output text."""
+
+    def __init__(self, model_name: str = "roberta-base"):
+        try:
+            from sentence_transformers import SentenceTransformer
+            self.model = SentenceTransformer(model_name, device="cpu")
+            logger.info(f"AuthorshipVerifier loaded with {model_name}")
+        except Exception as e:
+            logger.warning(f"Failed to load authorship model: {e}")
+            self.model = None
+
+    def verify(self, text_a: str, text_b: str) -> float:
+        """Return probability that both texts were written by the same author.
+
+        Uses sentence embedding similarity as a proxy for authorship.
+        Higher cosine similarity suggests same author.
+        """
+        if self.model is None:
+            return 0.5  # Neutral score if model unavailable
+
+        if not text_a or not text_b:
+            return 0.5
+
+        try:
+            embeddings = self.model.encode([text_a, text_b], convert_to_tensor=True)
+            sim = F.cosine_similarity(
+                embeddings[0].unsqueeze(0),
+                embeddings[1].unsqueeze(0),
+            )
+            # Scale similarity to [0, 1] probability
+            # Cosine similarity is already in [-1, 1], shift to [0, 1]
+            prob = (sim.item() + 1.0) / 2.0
+            return prob
+        except Exception as e:
+            logger.warning(f"Authorship verification failed: {e}")
+            return 0.5
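
Note: a quick usage sketch (texts invented). The returned score is just rescaled cosine similarity, so 1.0 means identical embeddings and 0.5 means orthogonal ones:

from src.evaluation.authorship_verifier import AuthorshipVerifier

verifier = AuthorshipVerifier()
prob = verifier.verify(
    "i think the resaults show a clear trend",    # author's raw draft
    "I think the results show a clear trend.",    # corrected output
)
print(f"same-author probability: {prob:.2f}")     # target: > 0.80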
src/evaluation/errant_evaluator.py ADDED
@@ -0,0 +1,82 @@
+"""
+ERRANT-based grammatical error evaluation.
+Uses the ERRANT toolkit for standardised GEC evaluation with
+precision, recall, and F0.5 scores.
+"""
+
+from typing import List, Dict
+from loguru import logger
+
+
+class ERRANTEvaluator:
+    """Evaluates grammar correction quality using ERRANT annotations."""
+
+    def __init__(self):
+        try:
+            import errant
+            self.annotator = errant.load("en")
+            logger.info("ERRANT annotator loaded")
+        except Exception as e:
+            logger.warning(f"ERRANT failed to load: {e}. Evaluation will use fallback.")
+            self.annotator = None
+
+    def evaluate(
+        self,
+        sources: List[str],
+        predictions: List[str],
+        references: List[str],
+    ) -> Dict[str, float]:
+        """Compute ERRANT precision, recall, F0.5."""
+        if self.annotator is None:
+            logger.warning("ERRANT not available, returning zero scores")
+            return {"precision": 0.0, "recall": 0.0, "f0.5": 0.0}
+
+        tp = 0
+        fp = 0
+        fn = 0
+
+        for src, pred, ref in zip(sources, predictions, references):
+            try:
+                # Parse source and annotate edits
+                orig = self.annotator.parse(src)
+                cor_pred = self.annotator.parse(pred)
+                cor_ref = self.annotator.parse(ref)
+
+                # Get edit annotations
+                pred_edits = self.annotator.annotate(orig, cor_pred)
+                ref_edits = self.annotator.annotate(orig, cor_ref)
+
+                # Convert to comparable sets of (start, end, correction, type)
+                pred_set = set()
+                for e in pred_edits:
+                    pred_set.add((e.o_start, e.o_end, e.c_str, e.type))
+
+                ref_set = set()
+                for e in ref_edits:
+                    ref_set.add((e.o_start, e.o_end, e.c_str, e.type))
+
+                # Count TP, FP, FN
+                tp += len(pred_set & ref_set)
+                fp += len(pred_set - ref_set)
+                fn += len(ref_set - pred_set)
+
+            except Exception as e:
+                logger.debug(f"ERRANT annotation failed for a sample: {e}")
+                continue
+
+        # Compute precision, recall, F0.5
+        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+
+        # F0.5 weighs precision higher than recall (β=0.5)
+        beta = 0.5
+        if precision + recall > 0:
+            f_score = (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)
+        else:
+            f_score = 0.0
+
+        return {
+            "precision": precision,
+            "recall": recall,
+            "f0.5": f_score,
+        }
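
Note: a worked example of the F0.5 arithmetic above, with invented counts. Suppose a batch yields tp=6, fp=2, fn=4: precision = 6/8 = 0.75, recall = 6/10 = 0.60, and F0.5 = 1.25 * (0.75 * 0.60) / (0.25 * 0.75 + 0.60) ≈ 0.71, so precise edits are rewarded over aggressive ones. A call sketch (requires errant to be installed):

from src.evaluation.errant_evaluator import ERRANTEvaluator

evaluator = ERRANTEvaluator()
scores = evaluator.evaluate(
    sources=["She go to school yesterday."],
    predictions=["She went to school yesterday."],
    references=["She went to school yesterday."],
)
print(scores)  # {"precision": ..., "recall": ..., "f0.5": ...}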
src/evaluation/gleu_scorer.py ADDED
@@ -0,0 +1,68 @@
+"""
+GLEU (Generalized Language Evaluation Understanding) score.
+Preferred over BLEU for grammatical error correction tasks.
+Also computes BERTScore for semantic similarity evaluation.
+"""
+
+import sacrebleu
+from bert_score import score as bert_score_fn
+from typing import List, Tuple
+from loguru import logger
+
+
+class GLEUScorer:
+    """Computes GLEU and BERTScore metrics for GEC evaluation."""
+
+    def compute_gleu(
+        self,
+        predictions: List[str],
+        references: List[str],
+    ) -> float:
+        """Corpus-level GLEU score (0-100).
+
+        GLEU is the geometric mean of n-gram precisions and recall,
+        preferred over BLEU for GEC because it equally penalises
+        both under-correction and over-correction.
+        """
+        if not predictions or not references:
+            return 0.0
+
+        # sacrebleu expects references as a list of lists
+        refs = [references]
+
+        # Use BLEU with smoothing as GLEU approximation
+        # sacrebleu doesn't have a native GLEU, so we use smoothed BLEU
+        bleu = sacrebleu.corpus_bleu(
+            predictions,
+            refs,
+            smooth_method="exp",
+            smooth_value=0.1,
+        )
+        return bleu.score
+
+    def compute_bert_score(
+        self,
+        predictions: List[str],
+        references: List[str],
+        lang: str = "en",
+    ) -> Tuple[float, float, float]:
+        """Returns (precision, recall, F1) as averages over the batch."""
+        if not predictions or not references:
+            return (0.0, 0.0, 0.0)
+
+        try:
+            P, R, F1 = bert_score_fn(
+                predictions,
+                references,
+                lang=lang,
+                verbose=False,
+                device="cpu",  # CPU-optimised
+            )
+            return (
+                P.mean().item(),
+                R.mean().item(),
+                F1.mean().item(),
+            )
+        except Exception as e:
+            logger.warning(f"BERTScore computation failed: {e}")
+            return (0.0, 0.0, 0.0)
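
Note: a usage sketch. compute_bert_score downloads a scoring model on first use; an identical prediction/reference pair scores 100.0 under the smoothed-BLEU approximation of GLEU:

from src.evaluation.gleu_scorer import GLEUScorer

scorer = GLEUScorer()
gleu = scorer.compute_gleu(
    predictions=["She went to school yesterday."],
    references=["She went to school yesterday."],
)  # → 100.0
p, r, f1 = scorer.compute_bert_score(
    predictions=["She went to school yesterday."],
    references=["She goes to school every day."],
)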
src/evaluation/style_metrics.py ADDED
@@ -0,0 +1,81 @@
+"""
+Measures style preservation between input and output.
+
+Key metrics:
+- Style Vector Cosine Similarity (target: > 0.85)
+- AWL Coverage Score (target: > 0.25)
+- Authorship Verification Score (target: > 0.80)
+"""
+
+import torch
+import torch.nn.functional as F
+from typing import List, Tuple
+from ..style.fingerprinter import StyleFingerprinter
+from ..vocabulary.awl_loader import AWLLoader
+from loguru import logger
+import numpy as np
+
+
+class StyleEvaluator:
+    """Evaluates style preservation and academic vocabulary coverage."""
+
+    def __init__(self, fingerprinter: StyleFingerprinter, awl: AWLLoader):
+        self.fingerprinter = fingerprinter
+        self.awl = awl
+
+    def style_similarity(self, text_a: str, text_b: str) -> float:
+        """Cosine similarity between style vectors. Target: > 0.85."""
+        vec_a = self.fingerprinter.extract_vector(text_a)
+        vec_b = self.fingerprinter.extract_vector(text_b)
+
+        if vec_a.dim() == 1:
+            vec_a = vec_a.unsqueeze(0)
+        if vec_b.dim() == 1:
+            vec_b = vec_b.unsqueeze(0)
+
+        sim = F.cosine_similarity(vec_a, vec_b, dim=-1)
+        return sim.item()
+
+    def awl_coverage(self, text: str) -> float:
+        """Fraction of content words in AWL. Target: > 0.25."""
+        if not text or not text.strip():
+            return 0.0
+
+        words = text.lower().split()
+        # Filter to content words (longer than 3 chars, alphabetic)
+        content_words = [w for w in words if len(w) > 3 and w.isalpha()]
+
+        if not content_words:
+            return 0.0
+
+        awl_count = sum(1 for w in content_words if self.awl.is_academic(w))
+        return awl_count / len(content_words)
+
+    def evaluate_batch(
+        self,
+        inputs: List[str],
+        outputs: List[str],
+        references: List[str],
+    ) -> dict:
+        """Compute style and AWL metrics for a batch."""
+        style_sims = []
+        awl_coverages = []
+        ref_style_sims = []
+
+        for inp, out, ref in zip(inputs, outputs, references):
+            # Style similarity between input and output (preservation)
+            style_sims.append(self.style_similarity(inp, out))
+
+            # AWL coverage of output
+            awl_coverages.append(self.awl_coverage(out))
+
+            # Style similarity between output and reference
+            ref_style_sims.append(self.style_similarity(out, ref))
+
+        return {
+            "style_similarity_mean": float(np.mean(style_sims)),
+            "style_similarity_std": float(np.std(style_sims)),
+            "awl_coverage_mean": float(np.mean(awl_coverages)),
+            "awl_coverage_std": float(np.std(awl_coverages)),
+            "ref_style_similarity_mean": float(np.mean(ref_style_sims)),
+        }
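
Note: StyleFingerprinter and AWLLoader live in src/style/ and src/vocabulary/, which fall outside this 50-file view; the constructor arguments below are copied from how src/inference/corrector.py calls them, so treat this as a sketch:

from src.style.fingerprinter import StyleFingerprinter
from src.vocabulary.awl_loader import AWLLoader
from src.evaluation.style_metrics import StyleEvaluator

fingerprinter = StyleFingerprinter(
    spacy_model="en_core_web_sm",
    awl_path="data/awl/coxhead_awl.txt",
)
awl = AWLLoader(primary_path="data/awl/coxhead_awl.txt")
evaluator = StyleEvaluator(fingerprinter, awl)

sim = evaluator.style_similarity("my rough draft", "my corrected draft")  # target > 0.85
cov = evaluator.awl_coverage("the corrected academic output")             # target > 0.25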
src/inference/__init__.py ADDED
File without changes
src/inference/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (181 Bytes)

src/inference/__pycache__/corrector.cpython-312.pyc ADDED
Binary file (9.52 kB)

src/inference/__pycache__/corrector.cpython-314.pyc ADDED
Binary file (13.7 kB)

src/inference/__pycache__/postprocessor.cpython-312.pyc ADDED
Binary file (4.72 kB)

src/inference/__pycache__/postprocessor.cpython-314.pyc ADDED
Binary file (5.69 kB)
 
src/inference/corrector.py ADDED
@@ -0,0 +1,283 @@
+"""
+End-to-end inference pipeline.
+Accepts raw dyslectic text (and optionally a master copy),
+returns corrected academic text with metadata.
+"""
+
+from ..preprocessing.pipeline import PreprocessingPipeline
+from ..style.fingerprinter import StyleFingerprinter
+from ..vocabulary.lexical_substitution import LexicalElevator, RegisterFilter
+from ..model.base_model import load_model_and_tokenizer
+from ..model.style_conditioner import StyleConditioner, prepend_style_prefix
+from ..model.generation_utils import generate_correction
+from .postprocessor import PostProcessor
+from ..evaluation.style_metrics import StyleEvaluator
+from ..vocabulary.awl_loader import AWLLoader
+import torch
+from typing import Optional
+from dataclasses import dataclass
+from loguru import logger
+import yaml
+
+
+TASK_PREFIX = (
+    "Correct the following text for grammar, spelling, and clarity. "
+    "Maintain the author's original tone and writing style. "
+    "Elevate vocabulary to academic register. "
+    "Do NOT change the meaning or add new information. "
+    "Preserve named entities exactly. "
+    "Text to correct: "
+)
+
+
+@dataclass
+class CorrectionResult:
+    original: str
+    corrected: str
+    preprocessed: str
+    style_similarity: float
+    awl_coverage: float
+    readability: dict
+    changes_summary: str
+
+
+class AcademicCorrector:
+    """Full inference pipeline: preprocess → fingerprint → generate → elevate → filter."""
+
+    def __init__(self, config: dict):
+        logger.info("Initialising AcademicCorrector...")
+
+        model_cfg = config.get("model", {})
+        gen_cfg = config.get("generation", {})
+        vocab_cfg = config.get("vocabulary", {})
+        style_cfg = config.get("style_conditioner", {})
+
+        # 1. Load model and tokenizer
+        model_key = model_cfg.get("key", "flan-t5-small")
+        checkpoint = model_cfg.get("checkpoint_path", None)
+        use_lora = model_cfg.get("use_lora", False)
+
+        if checkpoint and use_lora:
+            # PEFT adapter checkpoint: load base model + apply adapter
+            import os
+            try:
+                from peft import PeftModel
+                logger.info(f"Loading base model '{model_key}' + PEFT adapter from '{checkpoint}'")
+                self.model, self.tokenizer, self.is_seq2seq = load_model_and_tokenizer(
+                    model_key, quantize=False, use_lora=False
+                )
+                self.model = PeftModel.from_pretrained(self.model, checkpoint)
+                logger.info(f"PEFT adapter loaded from {checkpoint}")
+            except Exception as e:
+                logger.warning(f"PEFT loading failed ({e}), loading base model only")
+                self.model, self.tokenizer, self.is_seq2seq = load_model_and_tokenizer(
+                    model_key, quantize=False, use_lora=False
+                )
+        elif checkpoint:
+            # Full model checkpoint (merged weights)
+            try:
+                from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+                self.model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+                self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+                self.is_seq2seq = True
+                logger.info(f"Loaded full model from checkpoint: {checkpoint}")
+            except Exception:
+                logger.warning(f"Checkpoint not found, loading base model: {model_key}")
+                self.model, self.tokenizer, self.is_seq2seq = load_model_and_tokenizer(
+                    model_key, quantize=False, use_lora=False
+                )
+        else:
+            self.model, self.tokenizer, self.is_seq2seq = load_model_and_tokenizer(
+                model_key, quantize=False, use_lora=False
+            )
+
+        self.model.eval()
+        self.generation_config = gen_cfg
+
+        # 2. Preprocessor
+        self.preprocessor = PreprocessingPipeline()
+
+        # 3. Style fingerprinter
+        fp_cfg = config.get("fingerprinter", {})
+        self.fingerprinter = StyleFingerprinter(
+            spacy_model=fp_cfg.get("spacy_model", "en_core_web_sm"),
+            awl_path=vocab_cfg.get("awl_path", "data/awl/coxhead_awl.txt"),
+        )
+
+        # 4. Style conditioner: auto-detect hidden dim from loaded model
+        if hasattr(self.model.config, "d_model"):
+            auto_hidden_dim = self.model.config.d_model
+        elif hasattr(self.model.config, "hidden_size"):
+            auto_hidden_dim = self.model.config.hidden_size
+        else:
+            auto_hidden_dim = 512  # Safe default for T5-Small
+        logger.info(f"Auto-detected model hidden dim: {auto_hidden_dim}")
+
+        self.conditioner = StyleConditioner(
+            style_dim=style_cfg.get("style_dim", 512),
+            model_hidden_dim=style_cfg.get("model_hidden_dim", auto_hidden_dim),
+            n_prefix_tokens=style_cfg.get("n_prefix_tokens", 10),
+        )
+        self.conditioner.eval()
+
+        # 5. Vocabulary elevator
+        try:
+            self.elevator = LexicalElevator(
+                awl_path=vocab_cfg.get("awl_path", "data/awl/coxhead_awl.txt"),
+                spacy_model="en_core_web_sm",
+                mlm_model=vocab_cfg.get("mlm_model", "bert-large-uncased"),
+                sem_model=vocab_cfg.get("sem_model", "all-mpnet-base-v2"),
+            )
+        except Exception as e:
+            logger.warning(f"Lexical elevator init failed: {e}, elevation disabled")
+            self.elevator = None
+
+        # 6. Register filter
+        self.register_filter = RegisterFilter()
+
+        # 7. Post-processor
+        self.postprocessor = PostProcessor()
+
+        # 8. Evaluator
+        awl = AWLLoader(primary_path=vocab_cfg.get("awl_path", "data/awl/coxhead_awl.txt"))
+        self.evaluator = StyleEvaluator(self.fingerprinter, awl)
+
+        logger.info("AcademicCorrector initialised successfully")
+
+    def correct(
+        self,
+        raw_text: str,
+        master_copy: Optional[str] = None,
+        style_alpha: float = 0.6,
+    ) -> CorrectionResult:
+        """
+        Full correction pipeline:
+        1. Pre-process (spell correct + parse)
+        2. Style fingerprint
+        3. Generate with style conditioning
+        4. Post-process
+        5. Academic vocabulary elevation
+        6. Register filter
+        7. Compute quality metrics
+        """
+        # Step 1: Pre-process
+        logger.info("Step 1: Preprocessing...")
+        doc = self.preprocessor.process(raw_text)
+
+        # Step 2: Style fingerprint
+        logger.info("Step 2: Extracting style fingerprint...")
+        user_style = self.fingerprinter.extract_vector(doc.corrected_text)
+
+        if master_copy:
+            master_style = self.fingerprinter.extract_vector(master_copy)
+            target_style = self.fingerprinter.blend_vectors(user_style, master_style, alpha=style_alpha)
+        else:
+            target_style = user_style
+
+        # Step 3: Generate correction (sentence-chunked)
+        # The model was trained on max_input_length=128 tokens.
+        # Split text into sentence groups that fit within that window,
+        # process each chunk, then reassemble.
+        logger.info("Step 3: Generating correction (chunked)...")
+
+        MAX_INPUT_TOKENS = 128
+        # Measure how many tokens the task prefix uses
+        prefix_tokens = len(self.tokenizer.encode(TASK_PREFIX, add_special_tokens=False))
+        budget = MAX_INPUT_TOKENS - prefix_tokens - 2  # 2 for special tokens
+
+        # Split into sentences using spaCy (already loaded for fingerprinting)
+        sent_doc = self.fingerprinter.nlp(doc.corrected_text)
+        sentences = [sent.text.strip() for sent in sent_doc.sents if sent.text.strip()]
+
+        # Group sentences into chunks that fit the token budget
+        chunks = []
+        current_chunk = []
+        current_tokens = 0
+
+        for sent in sentences:
+            sent_tokens = len(self.tokenizer.encode(sent, add_special_tokens=False))
+            if current_tokens + sent_tokens > budget and current_chunk:
+                chunks.append(" ".join(current_chunk))
+                current_chunk = [sent]
+                current_tokens = sent_tokens
+            else:
+                current_chunk.append(sent)
+                current_tokens += sent_tokens
+
+        if current_chunk:
+            chunks.append(" ".join(current_chunk))
+
+        logger.info(f" Split into {len(chunks)} chunks from {len(sentences)} sentences")
+
+        corrected_chunks = []
+        device = next(self.model.parameters()).device
+
+        for i, chunk in enumerate(chunks):
+            chunk_input = TASK_PREFIX + chunk
+            inputs = self.tokenizer(
+                chunk_input,
+                max_length=MAX_INPUT_TOKENS,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            input_ids = inputs["input_ids"].to(device)
+            attention_mask = inputs["attention_mask"].to(device)
+
+            chunk_output = generate_correction(
+                self.model,
+                self.tokenizer,
+                input_ids,
+                attention_mask,
+                self.generation_config,
+            )
+            corrected_chunks.append(chunk_output)
+            logger.debug(f" Chunk {i+1}/{len(chunks)}: {len(chunk.split())} → {len(chunk_output.split())} words")
+
+        generated = " ".join(corrected_chunks)
+
+        # Step 4: Post-process
+        logger.info("Step 4: Post-processing...")
+        generated = self.postprocessor.clean(generated)
+        generated = self.postprocessor.restore_entities(
+            generated,
+            [e.text for e in doc.entities],
+            doc.protected_spans,
+        )
+
+        # Step 5: Vocabulary elevation
+        logger.info("Step 5: Vocabulary elevation...")
+        if self.elevator:
+            try:
+                generated = self.elevator.elevate(generated, doc.protected_spans)
+            except Exception as e:
+                logger.warning(f"Vocabulary elevation failed: {e}")
+
+        # Step 6: Register filter
+        logger.info("Step 6: Register filtering...")
+        generated = self.register_filter.apply(generated)
+
+        # Final formatting
+        generated = self.postprocessor.format_output(generated)
+
+        # Step 7: Compute quality metrics
+        logger.info("Step 7: Computing metrics...")
+        style_sim = self.evaluator.style_similarity(raw_text, generated)
+        awl_cov = self.evaluator.awl_coverage(generated)
+
+        # Build changes summary
+        changes = []
+        if doc.original_text != doc.corrected_text:
+            changes.append("Spelling/grammar corrections applied")
+        if generated != doc.corrected_text:
+            changes.append("Text restructured and elevated")
+        changes_summary = "; ".join(changes) if changes else "No changes needed"
+
+        return CorrectionResult(
+            original=raw_text,
+            corrected=generated,
+            preprocessed=doc.corrected_text,
+            style_similarity=style_sim,
+            awl_coverage=awl_cov,
+            readability=doc.readability,
+            changes_summary=changes_summary,
+        )
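
Note: an end-to-end sketch. The config keys mirror what __init__ reads (model, generation, vocabulary, style_conditioner, fingerprinter); the YAML filename is hypothetical:

import yaml
from src.inference.corrector import AcademicCorrector

with open("configs/inference.yaml") as f:  # hypothetical path
    config = yaml.safe_load(f)

corrector = AcademicCorrector(config)
result = corrector.correct("Ths is my frist draft abuot the experimant.")
print(result.corrected)
print(result.style_similarity, result.awl_coverage, result.changes_summary)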
src/inference/postprocessor.py ADDED
@@ -0,0 +1,118 @@
+"""
+Post-processing utilities for generated text.
+Handles cleanup, formatting, and final quality checks.
+"""
+
+import re
+from typing import List, Tuple
+from loguru import logger
+
+
+class PostProcessor:
+    """Cleans and formats generated text after model output."""
+
+    # Common generation artifacts to remove
+    ARTIFACTS = [
+        r'<pad>',
+        r'</s>',
+        r'<s>',
+        r'<unk>',
+        r'\[PAD\]',
+        r'\[CLS\]',
+        r'\[SEP\]',
+        r'<\|endoftext\|>',
+    ]
+
+    def __init__(self):
+        # Compile artifact removal regex
+        self._artifact_pattern = re.compile(
+            '|'.join(re.escape(a) if not a.startswith('\\') else a for a in self.ARTIFACTS),
+            re.IGNORECASE
+        )
+
+    def clean(self, text: str) -> str:
+        """Remove generation artifacts and normalise whitespace."""
+        if not text:
+            return ""
+
+        # Remove generation artifacts
+        result = self._artifact_pattern.sub('', text)
+
+        # Replace em dashes and en dashes with commas
+        result = result.replace('—', ',')
+        result = result.replace('–', ',')
+
+        # Normalise whitespace
+        result = re.sub(r'\s+', ' ', result)
+        result = result.strip()
+
+        # Fix common post-generation spacing issues
+        result = re.sub(r'\s+([.,!?;:])', r'\1', result)  # Remove space before punctuation
+        result = re.sub(r'([.,!?;:])([A-Za-z])', r'\1 \2', result)  # Add space after punctuation
+        result = re.sub(r'\(\s+', '(', result)  # Remove space after opening paren
+        result = re.sub(r'\s+\)', ')', result)  # Remove space before closing paren
+
+        # Fix multiple punctuation
+        result = re.sub(r'\.{2,}', '.', result)
+        result = re.sub(r'\?{2,}', '?', result)
+        result = re.sub(r'!{2,}', '!', result)
+
+        return result
+
+    def restore_entities(
+        self,
+        text: str,
+        original_entities: List[str],
+        protected_spans: List[Tuple[int, int]],
+    ) -> str:
+        """Restore named entities that may have been altered during generation.
+
+        Uses case-insensitive matching to find entities whose form was
+        altered in the generated text and restores the original form.
+        """
+        if not original_entities:
+            return text
+
+        result = text
+        for entity in original_entities:
+            # Check if entity is already present in correct form
+            if entity in result:
+                continue
+
+            # Try case-insensitive match
+            pattern = re.compile(re.escape(entity), re.IGNORECASE)
+            if pattern.search(result):
+                result = pattern.sub(entity, result, count=1)
+                logger.debug(f"Restored entity: {entity}")
+
+        return result
+
+    def format_output(self, text: str) -> str:
+        """Apply final formatting (capitalisation, punctuation, spacing)."""
+        if not text:
+            return ""
+
+        result = text.strip()
+
+        # Ensure first letter is capitalised
+        if result and result[0].islower():
+            result = result[0].upper() + result[1:]
+
+        # Ensure text ends with punctuation
+        if result and result[-1] not in '.!?':
+            result += '.'
+
+        # Capitalise after sentence-ending punctuation
+        result = re.sub(
+            r'([.!?]\s+)([a-z])',
+            lambda m: m.group(1) + m.group(2).upper(),
+            result
+        )
+
+        # Fix "i" → "I" when standalone
+        result = re.sub(r'\bi\b', 'I', result)
+
+        # Remove trailing whitespace from lines
+        result = '\n'.join(line.rstrip() for line in result.split('\n'))
+
+        return result
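
Note: a small trace (input invented) of what clean() followed by format_output() does:

from src.inference.postprocessor import PostProcessor

pp = PostProcessor()
raw = "<pad> this is grate — realy .. i think ( honestly )</s>"
print(pp.format_output(pp.clean(raw)))
# → "This is grate, realy. I think (honestly)."
# Artifacts stripped, dashes become commas, spacing and casing repaired;
# actual spelling errors ("grate", "realy") are left for the model to fix.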
src/model/__init__.py ADDED
File without changes
src/model/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (175 Bytes)

src/model/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (177 Bytes)

src/model/__pycache__/base_model.cpython-312.pyc ADDED
Binary file (5.88 kB)

src/model/__pycache__/base_model.cpython-314.pyc ADDED
Binary file (6.22 kB)

src/model/__pycache__/generation_utils.cpython-312.pyc ADDED
Binary file (4.25 kB)

src/model/__pycache__/generation_utils.cpython-314.pyc ADDED
Binary file (4.81 kB)

src/model/__pycache__/lora_adapter.cpython-312.pyc ADDED
Binary file (2.9 kB)

src/model/__pycache__/style_conditioner.cpython-312.pyc ADDED
Binary file (3.09 kB)

src/model/__pycache__/style_conditioner.cpython-314.pyc ADDED
Binary file (3.61 kB)
 
src/model/base_model.py ADDED
@@ -0,0 +1,135 @@
+"""
+Loads and wraps the base pretrained model.
+Supported architectures:
+- google/flan-t5-xl (recommended, 3B)
+- google/flan-t5-large (780M, resource-constrained)
+- facebook/bart-large (400M, excellent denoiser)
+- meta-llama/Meta-Llama-3.1-8B-Instruct (8B, best quality)
+"""
+
+from transformers import (
+    AutoTokenizer, AutoModelForSeq2SeqLM,
+    AutoModelForCausalLM, BitsAndBytesConfig
+)
+from peft import get_peft_model, LoraConfig, TaskType
+import torch
+from loguru import logger
+
+
+ENCODER_DECODER_MODELS = {
+    "flan-t5-xl": "google/flan-t5-xl",
+    "flan-t5-large": "google/flan-t5-large",
+    "flan-t5-base": "google/flan-t5-base",
+    "flan-t5-small": "google/flan-t5-small",
+    "bart-large": "facebook/bart-large",
+}
+
+DECODER_ONLY_MODELS = {
+    "llama-3.1-8b": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+}
+
+
+def load_model_and_tokenizer(model_key: str, quantize: bool = False, use_lora: bool = True,
+                             lora_config_dict: dict = None):
+    """
+    Load a pretrained model with optional LoRA and quantization.
+
+    Args:
+        model_key: Key from ENCODER_DECODER_MODELS or DECODER_ONLY_MODELS
+        quantize: Whether to use 4-bit quantization
+        use_lora: Whether to apply LoRA adapters
+        lora_config_dict: Optional dict with LoRA hyperparams (r, lora_alpha, etc.)
+
+    Returns:
+        Tuple of (model, tokenizer, is_seq2seq)
+    """
+    # Determine model type and HuggingFace identifier
+    is_seq2seq = model_key in ENCODER_DECODER_MODELS
+    is_causal = model_key in DECODER_ONLY_MODELS
+
+    if not is_seq2seq and not is_causal:
+        raise ValueError(
+            f"Unknown model key: '{model_key}'. "
+            f"Available: {list(ENCODER_DECODER_MODELS.keys()) + list(DECODER_ONLY_MODELS.keys())}"
+        )
+
+    model_name = ENCODER_DECODER_MODELS.get(model_key) or DECODER_ONLY_MODELS.get(model_key)
+    logger.info(f"Loading model: {model_name} (seq2seq={is_seq2seq}, quantize={quantize}, lora={use_lora})")
+
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # Configure quantization if requested
+    model_kwargs = {
+        "torch_dtype": torch.float32,  # CPU-optimised: use float32 for stability
+    }
+
+    # Detect device
+    device = "cpu"
+    if torch.cuda.is_available():
+        device = "cuda"
+        # Use bfloat16 if Ampere+, else float16
+        if torch.cuda.get_device_capability()[0] >= 8:
+            model_kwargs["torch_dtype"] = torch.bfloat16
+        else:
+            model_kwargs["torch_dtype"] = torch.float16
+
+    if quantize and device == "cuda":
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=model_kwargs["torch_dtype"],
+            bnb_4bit_use_double_quant=True,
+        )
+        model_kwargs["quantization_config"] = bnb_config
+        logger.info("Using 4-bit NF4 quantization")
+    elif quantize and device == "cpu":
+        logger.warning("Quantization requested but no GPU available, skipping")
+
+    # Load model
+    if is_seq2seq:
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **model_kwargs)
+    else:
+        model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
+
+    # Move to device if not quantized (quantized models are already on device)
+    if not quantize or device == "cpu":
+        model = model.to(device)
+
+    logger.info(f"Model loaded on {device} with dtype {model_kwargs.get('torch_dtype')}")
+
+    # Apply LoRA if requested
+    if use_lora:
+        lora_cfg = lora_config_dict or {}
+        task_type = TaskType.SEQ_2_SEQ_LM if is_seq2seq else TaskType.CAUSAL_LM
+
+        # Default target modules based on model architecture
+        default_targets = {
+            "flan-t5-xl": ["q", "v", "k", "o", "wi_0", "wi_1", "wo"],
+            "flan-t5-large": ["q", "v", "k", "o", "wi_0", "wi_1", "wo"],
+            "flan-t5-base": ["q", "v", "k", "o", "wi_0", "wi_1", "wo"],
+            "flan-t5-small": ["q", "v", "k", "o", "wi_0", "wi_1", "wo"],
+            "bart-large": ["q_proj", "v_proj", "k_proj", "out_proj"],
+            "llama-3.1-8b": ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+        }
+
+        lora_config = LoraConfig(
+            task_type=task_type,
+            r=lora_cfg.get("r", 16),
+            lora_alpha=lora_cfg.get("lora_alpha", 32),
+            lora_dropout=lora_cfg.get("lora_dropout", 0.05),
+            target_modules=lora_cfg.get("target_modules", default_targets.get(model_key, ["q", "v"])),
+            bias="none",
+        )
+
+        model = get_peft_model(model, lora_config)
+        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        total_params = sum(p.numel() for p in model.parameters())
+        logger.info(
+            f"LoRA applied: {trainable_params:,} trainable params / {total_params:,} total "
+            f"({100 * trainable_params / total_params:.2f}%)"
+        )
+
+    return model, tokenizer, is_seq2seq
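
Note: a loading sketch; flan-t5-small is the lightest key in ENCODER_DECODER_MODELS and runs on CPU when no GPU is detected:

from src.model.base_model import load_model_and_tokenizer

model, tokenizer, is_seq2seq = load_model_and_tokenizer(
    "flan-t5-small",
    quantize=False,          # 4-bit NF4 only applies on CUDA anyway
    use_lora=True,
    lora_config_dict={"r": 16, "lora_alpha": 32},
)
assert is_seq2seq  # T5 variants are encoder-decoder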
src/model/generation_utils.py ADDED
@@ -0,0 +1,106 @@
+"""
+Generation utilities for text correction.
+Handles beam search, constrained decoding, and post-generation cleanup.
+"""
+
+import torch
+from transformers import PreTrainedModel, PreTrainedTokenizer
+from typing import Dict, Optional, List
+from loguru import logger
+
+
+def generate_correction(
+    model: PreTrainedModel,
+    tokenizer: PreTrainedTokenizer,
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    generation_config: Dict,
+) -> str:
+    """Generate corrected text from input tokens."""
+    # Build generation kwargs from config
+    gen_kwargs = {
+        "input_ids": input_ids,
+        "attention_mask": attention_mask,
+        "max_new_tokens": generation_config.get("max_new_tokens", 512),
+        "num_beams": generation_config.get("num_beams", 5),
+        "length_penalty": generation_config.get("length_penalty", 1.0),
+        "no_repeat_ngram_size": generation_config.get("no_repeat_ngram_size", 3),
+        "min_length": generation_config.get("min_length", 10),
+        "early_stopping": generation_config.get("early_stopping", True),
+    }
+
+    # Optional sampling parameters
+    if generation_config.get("do_sample", False):
+        gen_kwargs["do_sample"] = True
+        gen_kwargs["temperature"] = generation_config.get("temperature", 0.7)
+        gen_kwargs["top_p"] = generation_config.get("top_p", 0.9)
+    else:
+        gen_kwargs["do_sample"] = False
+
+    with torch.no_grad():
+        output_ids = model.generate(**gen_kwargs)
+
+    # Decode, skipping special tokens
+    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    return generated_text.strip()
+
+
+def batch_generate(
+    model: PreTrainedModel,
+    tokenizer: PreTrainedTokenizer,
+    texts: List[str],
+    generation_config: Dict,
+    max_length: int = 512,
+) -> List[str]:
+    """Generate corrections for a batch of texts."""
+    if not texts:
+        return []
+
+    results = []
+    # Process in mini-batches to manage memory on CPU
+    batch_size = generation_config.get("batch_size", 4)
+
+    for i in range(0, len(texts), batch_size):
+        batch_texts = texts[i:i + batch_size]
+
+        # Tokenise batch
+        inputs = tokenizer(
+            batch_texts,
+            max_length=max_length,
+            padding=True,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        # Move to model device
+        device = next(model.parameters()).device
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        # Generate
+        gen_kwargs = {
+            "max_new_tokens": generation_config.get("max_new_tokens", 512),
+            "num_beams": generation_config.get("num_beams", 5),
+            "length_penalty": generation_config.get("length_penalty", 1.0),
+            "no_repeat_ngram_size": generation_config.get("no_repeat_ngram_size", 3),
+            "early_stopping": generation_config.get("early_stopping", True),
+        }
+
+        if generation_config.get("do_sample", False):
+            gen_kwargs["do_sample"] = True
+            gen_kwargs["temperature"] = generation_config.get("temperature", 0.7)
+
+        with torch.no_grad():
+            output_ids = model.generate(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                **gen_kwargs,
+            )
+
+        # Decode each output
+        for output in output_ids:
+            text = tokenizer.decode(output, skip_special_tokens=True)
+            results.append(text.strip())
+
+        logger.debug(f"Generated batch {i // batch_size + 1}: {len(batch_texts)} texts")
+
+    return results
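
Note: a batch-generation sketch, reusing the model and tokenizer from the loading sketch above (config values are illustrative; in the real pipeline, corrector.py prepends TASK_PREFIX to each input):

from src.model.generation_utils import batch_generate

gen_cfg = {"max_new_tokens": 128, "num_beams": 4, "batch_size": 2}
outputs = batch_generate(
    model,
    tokenizer,
    ["she dont like it", "i has a apple"],
    gen_cfg,
)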
src/model/lora_adapter.py ADDED
@@ -0,0 +1,54 @@
+"""
+LoRA adapter configuration and management.
+Wraps PEFT LoRA utilities for applying parameter-efficient
+fine-tuning to the base model.
+"""
+
+from peft import LoraConfig, TaskType, get_peft_model
+from typing import List, Optional
+from loguru import logger
+
+
+def create_lora_config(
+    task_type: TaskType,
+    r: int = 16,
+    lora_alpha: int = 32,
+    target_modules: Optional[List[str]] = None,
+    lora_dropout: float = 0.05,
+) -> LoraConfig:
+    """Create a LoRA configuration for the given task type."""
+    if target_modules is None:
+        target_modules = ["q", "v"]
+
+    config = LoraConfig(
+        task_type=task_type,
+        r=r,
+        lora_alpha=lora_alpha,
+        lora_dropout=lora_dropout,
+        target_modules=target_modules,
+        bias="none",
+        inference_mode=False,
+    )
+    logger.info(f"Created LoRA config: r={r}, alpha={lora_alpha}, dropout={lora_dropout}")
+    return config
+
+
+def apply_lora(model, lora_config: LoraConfig):
+    """Apply LoRA adapters to a model and return the wrapped model."""
+    peft_model = get_peft_model(model, lora_config)
+    trainable = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
+    total = sum(p.numel() for p in peft_model.parameters())
+    logger.info(f"LoRA applied: {trainable:,}/{total:,} trainable params ({100*trainable/total:.2f}%)")
+    return peft_model
+
+
+def merge_lora_weights(model):
+    """Merge LoRA weights into the base model for inference.
+
+    After merging, the model behaves like a regular model with
+    LoRA modifications baked in, removing the adapter overhead.
+    """
+    logger.info("Merging LoRA weights into base model...")
+    merged = model.merge_and_unload()
+    logger.info("LoRA weights merged successfully")
+    return merged
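
Note: the intended lifecycle as a sketch: build a config, wrap the model for training, then merge for deployment.

from peft import TaskType
from src.model.lora_adapter import create_lora_config, apply_lora, merge_lora_weights

# model: a base HF model, e.g. from load_model_and_tokenizer("flan-t5-small", use_lora=False)
config = create_lora_config(TaskType.SEQ_2_SEQ_LM, r=16, lora_alpha=32)
peft_model = apply_lora(model, config)
# ... fine-tune peft_model ...
deployable = merge_lora_weights(peft_model)  # adapters folded into the base weights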
src/model/style_conditioner.py ADDED
@@ -0,0 +1,74 @@
+"""
+Injects the style vector into the model via soft prompt conditioning.
+The style vector is projected to the model's hidden dimension and
+prepended to the input token embeddings as virtual tokens.
+
+This technique is called "prefix tuning" / "style prefix injection".
+It biases the model's attention toward the desired output style
+without modifying the base model weights.
+
+For Flan-T5: injects into encoder input embeddings
+For BART: injects into encoder input embeddings
+For Llama: prepends to the full input context
+"""
+
+import torch
+import torch.nn as nn
+
+
+class StyleConditioner(nn.Module):
+    """
+    Projects a 512-dim style vector to n_prefix_tokens virtual tokens
+    in the model's embedding space.
+    """
+
+    def __init__(
+        self,
+        style_dim: int = 512,
+        model_hidden_dim: int = 512,  # T5-Small=512, Base=768, Large=1024, XL=2048
+        n_prefix_tokens: int = 10,  # Number of virtual prefix tokens
+    ):
+        super().__init__()
+        self.style_dim = style_dim
+        self.model_hidden_dim = model_hidden_dim
+        self.n_prefix_tokens = n_prefix_tokens
+
+        # Project style vector to prefix embeddings:
+        # style_dim → n_prefix_tokens * model_hidden_dim
+        total_output_dim = n_prefix_tokens * model_hidden_dim
+        self.projection = nn.Sequential(
+            nn.Linear(style_dim, total_output_dim),
+            nn.Tanh(),
+        )
+
+    def forward(self, style_vector: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            style_vector: [batch_size, 512]
+        Returns:
+            prefix_embeddings: [batch_size, n_prefix_tokens, model_hidden_dim]
+        """
+        # Project: [batch, 512] → [batch, n_prefix * hidden_dim]
+        projected = self.projection(style_vector)
+
+        # Reshape: [batch, n_prefix * hidden_dim] → [batch, n_prefix, hidden_dim]
+        batch_size = style_vector.size(0)
+        prefix_embeddings = projected.view(batch_size, self.n_prefix_tokens, self.model_hidden_dim)
+
+        return prefix_embeddings
+
+
+def prepend_style_prefix(
+    input_embeddings: torch.Tensor,
+    style_prefix: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Concatenates style prefix to input embeddings along sequence dimension.
+
+    Args:
+        input_embeddings: [batch, seq_len, hidden_dim]
+        style_prefix: [batch, n_prefix, hidden_dim]
+    Returns:
+        [batch, n_prefix + seq_len, hidden_dim]
+    """
+    return torch.cat([style_prefix, input_embeddings], dim=1)
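
Note: a shape-check sketch with random tensors standing in for a real fingerprint and real encoder embeddings:

import torch
from src.model.style_conditioner import StyleConditioner, prepend_style_prefix

conditioner = StyleConditioner(style_dim=512, model_hidden_dim=512, n_prefix_tokens=10)
style_vec = torch.randn(1, 512)          # stand-in for a style fingerprint
prefix = conditioner(style_vec)          # → [1, 10, 512]
token_embeds = torch.randn(1, 20, 512)   # stand-in for encoder input embeddings
conditioned = prepend_style_prefix(token_embeds, prefix)
assert conditioned.shape == (1, 30, 512)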
src/preprocessing/__init__.py ADDED
File without changes
src/preprocessing/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (183 Bytes)

src/preprocessing/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (185 Bytes)

src/preprocessing/__pycache__/dependency_parser.cpython-312.pyc ADDED
Binary file (3.65 kB)

src/preprocessing/__pycache__/dyslexia_simulator.cpython-312.pyc ADDED
Binary file (6.75 kB)

src/preprocessing/__pycache__/dyslexia_simulator.cpython-314.pyc ADDED
Binary file (8.15 kB)

src/preprocessing/__pycache__/ner_tagger.cpython-312.pyc ADDED
Binary file (2.7 kB)