Commit 2d0ef3b · Parent(s): aa47fca
feat: update classifier model to local zero-shot NLI and enhance language detection with local library
Files changed:
- .env.example +2 -4
- README.md +4 -2
- app/core/config.py +2 -4
- app/pipelines/classification_pipeline.py +3 -1
- app/services/classifier_service.py +61 -20
- app/services/extraction_service.py +5 -4
- app/services/language_service.py +13 -29
- docker-compose.yml +2 -4
- docs/explanation/architecture.md +7 -7
- docs/explanation/decisions.md +5 -6
- docs/how-to/deploy-with-docker-compose.md +1 -1
- docs/how-to/run-locally.md +3 -4
- docs/reference/api.md +1 -1
- docs/reference/configuration.md +3 -4
- docs/reference/runtime-state.md +2 -2
- docs/tutorials/getting-started.md +1 -1
- requirements.txt +1 -1
- tests/test_classification_pipeline_behavior.py +37 -0
- tests/test_classifier_service.py +94 -0
- tests/test_language_service.py +34 -0
.env.example
CHANGED
@@ -5,10 +5,8 @@ DEBUG=false
 STATIC_DIR=static
 UPLOAD_SUBDIR=uploads
 
-CLASSIFIER_MODEL=AyoubChLin/
+CLASSIFIER_MODEL=AyoubChLin/bert-base-uncased-zeroshot-nli
+ENABLE_MODEL_QUANTIZATION=true
 HUGGINGFACE_TOKEN=
 
-LANGUAGE_DETECTOR_URL=https://team-language-detector-languagedetector.hf.space/run/predict
-REQUEST_TIMEOUT_SECONDS=30
-
 DEFAULT_LABELS_CSV=news,sport,finance,politics
README.md
CHANGED
@@ -40,8 +40,8 @@ cp .env.example .env
 
 Key vars:
 - `CLASSIFIER_MODEL`
+- `ENABLE_MODEL_QUANTIZATION`
 - `HUGGINGFACE_TOKEN`
-- `LANGUAGE_DETECTOR_URL`
 - `DEFAULT_LABELS_CSV`
 
 ## Local Run
@@ -63,4 +63,6 @@ pytest -q
 ## Notes
 - OCR requires `tesseract-ocr` (installed in Dockerfile).
 - Supported extraction formats in this refactor: `.pdf`, `.docx`, `.xlsx`, image formats, and plain text files.
-- The classifier model is loaded directly from Hugging Face Hub
+- The classifier model is loaded directly from Hugging Face Hub and runs true zero-shot classification over runtime labels.
+- Language detection runs locally via `langdetect` (no remote language endpoint dependency).
+- `/classify` uses only the first PDF page for classification; `/api/transformer` still extracts full content.
app/core/config.py
CHANGED
@@ -15,12 +15,10 @@ class Settings(BaseSettings):
     static_dir: Path = Path("static")
     upload_subdir: str = "uploads"
 
-    classifier_model: str = "AyoubChLin/
+    classifier_model: str = "AyoubChLin/bert-base-uncased-zeroshot-nli"
+    enable_model_quantization: bool = True
     huggingface_token: str | None = None
 
-    language_detector_url: str = "https://team-language-detector-languagedetector.hf.space/run/predict"
-    request_timeout_seconds: float = 30.0
-
    default_labels_csv: str = Field(default="news,sport,finance,politics")
 
     @property
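The new `enable_model_quantization` flag gates PyTorch dynamic INT8 quantization of the model's `Linear` layers (applied in `classifier_service.py` below). A minimal standalone sketch of the underlying call, using a toy module rather than the app's model:

```python
# Toy illustration of dynamic INT8 quantization; the Sequential module is a
# stand-in for the real classifier, not the app's code.
import torch

model = torch.nn.Sequential(
    torch.nn.Linear(16, 8),
    torch.nn.ReLU(),
    torch.nn.Linear(8, 2),
).eval()

quantized = torch.ao.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},  # only Linear layers are converted
    dtype=torch.qint8,  # weights stored as INT8; activations quantized on the fly
)
print(quantized)  # Linear layers are replaced by dynamic quantized equivalents
```

Dynamic quantization shrinks weights and can speed up CPU inference, which is why the service applies it only after `model.to("cpu")` and falls back (with a logged warning) if the backend is unavailable.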
app/pipelines/classification_pipeline.py
CHANGED
@@ -25,7 +25,9 @@ class ClassificationPipeline:
         return text
 
     def classify_file(self, original_filename: str, file_path: Path) -> dict:
-        text =
+        text = extraction_service.extract_text(original_filename, file_path, pdf_first_page_only=True)
+        if not text or not text.strip():
+            raise ExtractionError("No text extracted from file")
         preprocessed_text = preprocess_text(text)
 
         language = language_service.detect_language(preprocessed_text)
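A quick illustration of the new flag's effect, assuming the repo's package is importable and a multi-page `sample.pdf` exists locally (both are assumptions for this sketch):

```python
from pathlib import Path

from app.services.extraction_service import extraction_service

# /classify path: only the first PDF page feeds preprocessing and classification.
first_page = extraction_service.extract_text(
    "sample.pdf", Path("sample.pdf"), pdf_first_page_only=True
)

# /api/transformer path: full-document extraction is unchanged (flag defaults to False).
full_document = extraction_service.extract_text("sample.pdf", Path("sample.pdf"))
```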
app/services/classifier_service.py
CHANGED
@@ -1,3 +1,4 @@
+import logging
 from typing import Any
 
 import torch
@@ -6,8 +7,12 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from app.core.config import settings
 from app.core.exceptions import ClassificationError
 
+logger = logging.getLogger(__name__)
+
 
 class ClassifierService:
+    _HYPOTHESIS_TEMPLATE = "This text is about {}."
+
     def __init__(self) -> None:
         self._tokenizer: Any | None = None
         self._model: Any | None = None
@@ -15,21 +20,34 @@ class ClassifierService:
     def _load_model(self) -> tuple[Any, Any]:
         if self._tokenizer is None or self._model is None:
             try:
-                tokenizer = AutoTokenizer.from_pretrained(
-
-
+                tokenizer = AutoTokenizer.from_pretrained(
+                    settings.classifier_model,
+                    token=settings.huggingface_token,
+                )
+                model = AutoModelForSequenceClassification.from_pretrained(
+                    settings.classifier_model,
+                    token=settings.huggingface_token,
+                )
                 model.eval()
                 model.to("cpu")
 
-
-
-
-
-
-
+                if settings.enable_model_quantization:
+                    try:
+                        # Dynamic INT8 quantization for CPU inference.
+                        quantized_model = torch.ao.quantization.quantize_dynamic(
+                            model,
+                            {torch.nn.Linear},
+                            dtype=torch.qint8,
+                        )
+                        model = quantized_model
+                    except Exception:
+                        logger.warning(
+                            "Model quantization failed; using non-quantized model instead.",
+                            exc_info=True,
+                        )
 
                 self._tokenizer = tokenizer
-                self._model =
+                self._model = model
             except Exception as exc:
                 raise ClassificationError("Unable to initialize classifier model") from exc
 
@@ -38,31 +56,54 @@ class ClassifierService:
     def warmup(self) -> None:
         self._load_model()
 
+    @staticmethod
+    def _normalize_labels(labels: list[str]) -> list[str]:
+        cleaned = [label.strip() for label in labels if isinstance(label, str) and label.strip()]
+        return list(dict.fromkeys(cleaned))
+
+    @staticmethod
+    def _resolve_entailment_id(model: Any) -> int:
+        label2id = getattr(model.config, "label2id", {}) or {}
+        for label, label_id in label2id.items():
+            if isinstance(label, str) and label.lower().startswith("entail"):
+                return int(label_id)
+
+        id2label = getattr(model.config, "id2label", {}) or {}
+        for label_id, label in id2label.items():
+            if isinstance(label, str) and label.lower().startswith("entail"):
+                return int(label_id)
+
+        raise ClassificationError("Classifier model is missing an entailment label mapping")
+
     def classify(self, text: str, labels: list[str]) -> str:
-        if not labels:
+        candidate_labels = self._normalize_labels(labels)
+        if not candidate_labels:
             raise ClassificationError("No labels configured")
 
         tokenizer, model = self._load_model()
+        entailment_id = self._resolve_entailment_id(model)
 
         try:
+            sequence_pairs = [[text, self._HYPOTHESIS_TEMPLATE.format(label)] for label in candidate_labels]
             inputs = tokenizer(
-
+                sequence_pairs,
                 padding=True,
-                truncation=
+                truncation="only_first",
                 return_tensors="pt",
             )
 
             with torch.no_grad():
                 logits = model(**inputs).logits
 
-
-
-            if
-
+            if logits.ndim != 2:
+                raise ClassificationError("Classifier returned unexpected logits shape")
+            if entailment_id < 0 or entailment_id >= logits.shape[-1]:
+                raise ClassificationError("Entailment label index is out of range for classifier output")
+
+            entailment_logits = logits[:, entailment_id]
+            best_index = int(torch.argmax(entailment_logits).item())
+            return candidate_labels[best_index]
         except Exception as exc:
             raise ClassificationError("Classifier prediction failed") from exc
 
-        raise ClassificationError("Classifier did not return a valid label")
-
-
 classifier_service = ClassifierService()
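For readers unfamiliar with the NLI approach used above: each candidate label becomes a hypothesis ("This text is about {label}.") paired with the input text as premise, and the label whose pair earns the strongest entailment logit wins. A minimal standalone sketch of that scoring loop; the `ENTAILMENT` key is an assumption here (it varies across NLI checkpoints, which is exactly why the service resolves it via `_resolve_entailment_id`):

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_id = "AyoubChLin/bert-base-uncased-zeroshot-nli"  # model from the diff above
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id).eval()

text = "The central bank raised interest rates again this quarter."
labels = ["news", "sport", "finance", "politics"]

# One (premise, hypothesis) pair per candidate label.
pairs = [[text, f"This text is about {label}."] for label in labels]
inputs = tokenizer(pairs, padding=True, truncation="only_first", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits  # shape: (len(labels), num_nli_classes)

entailment_id = model.config.label2id["ENTAILMENT"]  # assumed key name for this checkpoint
print(labels[int(torch.argmax(logits[:, entailment_id]))])  # e.g. "finance"
```

`truncation="only_first"` matters: when a pair exceeds the model's maximum length, only the document (premise) is truncated, never the short hypothesis.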
app/services/extraction_service.py
CHANGED
@@ -16,10 +16,11 @@ TEXT_EXTENSIONS = {".txt", ".md", ".csv", ".json"}
 
 class ExtractionService:
     @staticmethod
-    def _extract_pdf(file_path: Path) -> str:
+    def _extract_pdf(file_path: Path, first_page_only: bool = False) -> str:
         reader = PdfReader(str(file_path))
         chunks: list[str] = []
-        for page in reader.pages:
+        pages = reader.pages[:1] if first_page_only else reader.pages
+        for page in pages:
             text = page.extract_text() or ""
             if text.strip():
                 chunks.append(text)
@@ -41,13 +42,13 @@
         workbook.close()
         return "\n".join(chunks)
 
-    def extract_text(self, file_name: str, file_path: Path) -> str:
+    def extract_text(self, file_name: str, file_path: Path, pdf_first_page_only: bool = False) -> str:
         extension = Path(file_name).suffix.lower()
 
         try:
             if extension in DOC_EXTENSIONS:
                 if extension == ".pdf":
-                    return self._extract_pdf(file_path)
+                    return self._extract_pdf(file_path, first_page_only=pdf_first_page_only)
                 if extension == ".docx":
                     return self._extract_docx(file_path)
                 if extension == ".xlsx":
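The first-page restriction is just a slice over `reader.pages`. A standalone sketch, assuming the `pypdf` package (a plausible source of the `PdfReader` used above) and a local `sample.pdf`:

```python
from pypdf import PdfReader  # assumed import; the service may use an equivalent package

reader = PdfReader("sample.pdf")
pages = reader.pages[:1]  # first_page_only=True; use reader.pages for the full document

chunks = []
for page in pages:
    text = page.extract_text() or ""  # extract_text() can return None for empty pages
    if text.strip():
        chunks.append(text)

print("\n".join(chunks)[:200])
```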
app/services/language_service.py
CHANGED
@@ -1,41 +1,25 @@
-import requests
+from langdetect import DetectorFactory, LangDetectException, detect
 
-from app.core.config import settings
 from app.core.exceptions import LanguageDetectionError
 
+# Ensure deterministic language detection outcomes across runs.
+DetectorFactory.seed = 0
 
+
-class LanguageService:
-    def __init__(self) -> None:
-        self._session = requests.Session()
-
+class LanguageService:
     def detect_language(self, text: str) -> str:
         try:
-
-
-
-
-            )
-
-
-
-            raise LanguageDetectionError("Language detection request failed") from exc
-        except ValueError as exc:
-            raise LanguageDetectionError("Language detector returned invalid JSON") from exc
-
-        data = payload.get("data") if isinstance(payload, dict) else None
-        if not data or not isinstance(data, list):
-            raise LanguageDetectionError("Language detector response missing 'data' field")
-
-        first = data[0]
-        if isinstance(first, dict):
-            label = first.get("label")
-        else:
-            label = first
-
-        if not isinstance(label, str) or not label.strip():
+            language = detect(text)
+        except LangDetectException as exc:
+            raise LanguageDetectionError("Language detection failed") from exc
+        except Exception as exc:
+            raise LanguageDetectionError("Language detector raised an unexpected error") from exc
+
+        normalized_language = language.split("-", 1)[0].strip().lower() if isinstance(language, str) else ""
+        if not normalized_language:
             raise LanguageDetectionError("Language detector did not return a valid label")
 
-        return
+        return normalized_language
 
 
 language_service = LanguageService()
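`langdetect` is probabilistic under the hood, which is why the module pins `DetectorFactory.seed = 0` for repeatable results. A quick sketch of the raw behavior the service wraps and normalizes:

```python
from langdetect import DetectorFactory, detect

DetectorFactory.seed = 0  # detection samples internally; seeding makes output deterministic

print(detect("This is a detailed English sentence about financial markets."))  # 'en'
print(detect("Ceci est une phrase française sur les marchés financiers."))  # 'fr'

# langdetect can emit region-tagged codes such as 'zh-cn'; the service keeps only
# the primary subtag, mirroring language.split("-", 1)[0].strip().lower():
print("zh-cn".split("-", 1)[0].strip().lower())  # 'zh'
```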
docker-compose.yml
CHANGED
@@ -10,11 +10,9 @@ services:
       DEBUG: ${DEBUG:-false}
       STATIC_DIR: ${STATIC_DIR:-static}
       UPLOAD_SUBDIR: ${UPLOAD_SUBDIR:-uploads}
-
-
+      CLASSIFIER_MODEL: ${CLASSIFIER_MODEL:-AyoubChLin/bert-base-uncased-zeroshot-nli}
+      ENABLE_MODEL_QUANTIZATION: ${ENABLE_MODEL_QUANTIZATION:-true}
       HUGGINGFACE_TOKEN: ${HUGGINGFACE_TOKEN:-}
-      LANGUAGE_DETECTOR_URL: ${LANGUAGE_DETECTOR_URL:-https://team-language-detector-languagedetector.hf.space/run/predict}
-      REQUEST_TIMEOUT_SECONDS: ${REQUEST_TIMEOUT_SECONDS:-30}
       DEFAULT_LABELS_CSV: ${DEFAULT_LABELS_CSV:-news,sport,finance,politics}
     ports:
       - "7860:7860"
docs/explanation/architecture.md
CHANGED
@@ -1,7 +1,7 @@
 # Architecture Explanation
 
 ## 1. Executive summary
-`classifier-general` is a single FastAPI service that classifies text and files
+`classifier-general` is a single FastAPI service that classifies text and files with local extraction/preprocessing, a local Hugging Face zero-shot NLI model, and local language detection.
 
 Evidence:
 - `app/main.py`
@@ -40,8 +40,7 @@
 ### Context view
 Actors/systems:
 - API client sending text/files.
-
-- External language detector endpoint (`LANGUAGE_DETECTOR_URL`).
+- Hugging Face model hub (model download/auth when needed).
 - Local filesystem for uploaded files.
 
 Evidence:
@@ -67,9 +66,10 @@
 1. `POST /classify` receives file.
 2. File saved to upload directory.
 3. Text extracted by extension-specific handlers.
+   - For `/classify`, PDF extraction is first-page only.
 4. Text preprocessed (regex cleanup + min words).
-5.
-6.
+5. Local language detector called.
+6. Zero-shot NLI classifier scores runtime labels and selects top label.
 7. Response returns `{label, language}` plus `type=not english` when applicable.
 
 Evidence:
@@ -82,7 +82,7 @@
 ## 4. Cross-cutting concerns
 ### Validation and error mapping
 - Input schemas use strict `extra=forbid`.
-- Error mapping explicitly separates validation/extraction (400) from
+- Error mapping explicitly separates validation/extraction (400) from classifier/language inference failures (502).
 
 Evidence:
 - `app/schemas/classification.py`
@@ -112,7 +112,7 @@
 - `tests/test_routes.py`
 
 ## 5. Risks, gaps, and technical debt
-
+- Local model initialization can fail if model/token/resources are invalid.
 - No upload retention/cleanup process.
 - Readiness check does not probe external AI services, only local label readiness.
 - No authentication/authorization layer on API endpoints.
docs/explanation/decisions.md
CHANGED
@@ -21,23 +21,22 @@
 - Rationale:
   - Keep clients functional while refactoring internals.
 
-## ADR-003: Use
+## ADR-003: Use local Hugging Face zero-shot NLI model for classification
 - Status: Accepted
 - Type: Explicit
 - Evidence:
   - `app/core/config.py`
   - `app/services/classifier_service.py`
 - Rationale:
-
+  - Perform true runtime-label zero-shot classification with local inference control.
 
-## ADR-004: Use
+## ADR-004: Use local `langdetect` library for language detection
 - Status: Accepted
 - Type: Explicit
 - Evidence:
   - `app/services/language_service.py`
-  - `app/core/config.py`
 - Rationale:
-
+  - Remove external dependency and keep language inference local.
 
 ## ADR-005: Keep labels in in-memory mutable config
 - Status: Accepted (current), Needs review
@@ -68,7 +67,7 @@
 - `app/routers/classification.py`
 - `app/core/exceptions.py`
 - Rationale:
-  - Differentiate local validation issues (`400`) from
+  - Differentiate local validation issues (`400`) from inference failures (`502`).
 
 ## ADR-008: No built-in auth layer for this API
 - Status: Accepted (current), Needs review
docs/how-to/deploy-with-docker-compose.md
CHANGED
@@ -22,7 +22,7 @@ curl -s http://localhost:4002/health/liveness
 
 ## Production hardening gaps
 - No reverse proxy/TLS config in this repo.
-
+- Initial model pull can require network access if the HF cache is cold.
 - No horizontal scaling coordination for in-memory labels (`/configlabel` mutates process-local state).
 
 Evidence:
docs/how-to/run-locally.md
CHANGED
@@ -17,9 +17,8 @@ cp .env.example .env
 ```
 
 Critical settings:
-- `
-- `
-- `LANGUAGE_DETECTOR_URL`
+- `CLASSIFIER_MODEL`
+- `ENABLE_MODEL_QUANTIZATION`
 - `DEFAULT_LABELS_CSV`
 
 Evidence:
@@ -54,7 +53,7 @@
 - `400 Text must contain at least 4 words`:
   - input failed preprocessing minimum-word rule.
 - `502 Classifier request failed`:
-
+  - local model load or prediction failed (model ID/token/resource issue).
 - OCR extraction quality is low:
   - verify tesseract install and image quality.
docs/reference/api.md
CHANGED
@@ -23,7 +23,7 @@
 
 ## Validation and errors
 - `400` for input validation and extraction problems.
-- `502` for
+- `502` for classifier/language inference failures.
 - `500` for unexpected failures.
 
 Evidence:
docs/reference/configuration.md
CHANGED
@@ -26,16 +26,15 @@ Evidence:
 
 | Variable | Default | Purpose |
 |---|---|---|
-| `
-| `
+| `CLASSIFIER_MODEL` | `AyoubChLin/bert-base-uncased-zeroshot-nli` | Hugging Face model ID used for local zero-shot NLI classification |
+| `ENABLE_MODEL_QUANTIZATION` | `true` | enable dynamic INT8 quantization with automatic fallback |
 | `HUGGINGFACE_TOKEN` | empty | optional auth token for client init |
 
 ## Language detector settings
 
 | Variable | Default | Purpose |
 |---|---|---|
-
-| `REQUEST_TIMEOUT_SECONDS` | `30` | HTTP timeout for language requests |
+| none | n/a | language detection now uses local `langdetect` |
 
 ## Label settings
docs/reference/runtime-state.md
CHANGED
@@ -33,8 +33,8 @@ Evidence:
 
 | Dependency | Usage |
 |---|---|
-
-
+| Hugging Face model hub | model download/auth for local classifier initialization |
+| `langdetect` library | local language inference |
 | Tesseract binary | OCR extraction for images |
 
 Evidence:
docs/tutorials/getting-started.md
CHANGED
@@ -4,7 +4,7 @@ This tutorial runs the classifier API and validates endpoint contracts.
 
 ## Prerequisites
 - Docker and Docker Compose
-- Internet access for
+- Internet access for initial Hugging Face model download (unless model is already cached)
 
 Evidence:
 - `docker-compose.yml`
requirements.txt
CHANGED
@@ -2,8 +2,8 @@ fastapi==0.115.8
 uvicorn[standard]==0.34.0
 pydantic==2.10.6
 pydantic-settings==2.7.1
-requests==2.32.3
 python-multipart==0.0.20
+langdetect==1.0.9
 
 transformers==4.46.0
 torch==2.5.1
tests/test_classification_pipeline_behavior.py
ADDED
@@ -0,0 +1,37 @@
+from pathlib import Path
+
+from app.pipelines.classification_pipeline import classification_pipeline
+import app.pipelines.classification_pipeline as pipeline_module
+
+
+def test_classify_file_uses_pdf_first_page_only(monkeypatch):
+    extraction_flags: list[bool] = []
+
+    def _fake_extract_text(file_name, file_path, pdf_first_page_only=False):
+        extraction_flags.append(pdf_first_page_only)
+        return "This is enough content for preprocessing and classification."
+
+    monkeypatch.setattr(pipeline_module.extraction_service, "extract_text", _fake_extract_text)
+    monkeypatch.setattr(pipeline_module.language_service, "detect_language", lambda text: "en")
+    monkeypatch.setattr(pipeline_module.label_service, "get_labels", lambda: ["news", "sport"])
+    monkeypatch.setattr(pipeline_module.classifier_service, "classify", lambda text, labels: "news")
+
+    result = classification_pipeline.classify_file("sample.pdf", Path("sample.pdf"))
+
+    assert extraction_flags == [True]
+    assert result == {"label": "news", "language": "en"}
+
+
+def test_transform_file_uses_full_extraction(monkeypatch):
+    extraction_flags: list[bool] = []
+
+    def _fake_extract_text(file_name, file_path, pdf_first_page_only=False):
+        extraction_flags.append(pdf_first_page_only)
+        return "This is full extracted content."
+
+    monkeypatch.setattr(pipeline_module.extraction_service, "extract_text", _fake_extract_text)
+
+    content = classification_pipeline.transform_file("sample.pdf", Path("sample.pdf"))
+
+    assert extraction_flags == [False]
+    assert content == "This is full extracted content."
tests/test_classifier_service.py
ADDED
@@ -0,0 +1,94 @@
+from types import SimpleNamespace
+
+import torch
+
+import app.services.classifier_service as classifier_module
+
+
+class _FakeTokenizer:
+    def __call__(self, sequence_pairs, padding, truncation, return_tensors):
+        batch_size = len(sequence_pairs)
+        return {
+            "input_ids": torch.ones((batch_size, 2), dtype=torch.long),
+            "attention_mask": torch.ones((batch_size, 2), dtype=torch.long),
+        }
+
+
+class _FakeInferenceModel:
+    def __init__(self, logits: torch.Tensor) -> None:
+        self._logits = logits
+        self.config = SimpleNamespace(
+            label2id={"CONTRADICTION": 0, "ENTAILMENT": 1},
+            id2label={0: "CONTRADICTION", 1: "ENTAILMENT"},
+        )
+
+    def __call__(self, **kwargs):
+        return SimpleNamespace(logits=self._logits)
+
+
+class _FakeLoadModel:
+    def __init__(self) -> None:
+        self.config = SimpleNamespace(
+            label2id={"CONTRADICTION": 0, "ENTAILMENT": 1},
+            id2label={0: "CONTRADICTION", 1: "ENTAILMENT"},
+        )
+
+    def eval(self):
+        return self
+
+    def to(self, device):
+        return self
+
+
+def test_classify_uses_runtime_candidate_labels(monkeypatch):
+    service = classifier_module.ClassifierService()
+    tokenizer = _FakeTokenizer()
+    model = _FakeInferenceModel(
+        logits=torch.tensor(
+            [
+                [3.2, 0.4],  # finance -> low entailment
+                [0.3, 4.1],  # sport -> highest entailment
+                [1.5, 1.9],  # politics -> second-best entailment
+            ]
+        )
+    )
+
+    monkeypatch.setattr(service, "_load_model", lambda: (tokenizer, model))
+
+    predicted = service.classify(
+        "This article discusses the latest football transfer strategy.",
+        ["finance", "sport", "politics"],
+    )
+
+    assert predicted == "sport"
+
+
+def test_model_quantization_falls_back_to_non_quantized_model(monkeypatch):
+    service = classifier_module.ClassifierService()
+    fake_model = _FakeLoadModel()
+    fake_tokenizer = object()
+
+    monkeypatch.setattr(
+        classifier_module.AutoTokenizer,
+        "from_pretrained",
+        lambda *args, **kwargs: fake_tokenizer,
+    )
+    monkeypatch.setattr(
+        classifier_module.AutoModelForSequenceClassification,
+        "from_pretrained",
+        lambda *args, **kwargs: fake_model,
+    )
+    monkeypatch.setattr(classifier_module.settings, "enable_model_quantization", True)
+
+    def _raise_quantization_error(*args, **kwargs):
+        raise RuntimeError("quantization backend unavailable")
+
+    monkeypatch.setattr(
+        classifier_module.torch.ao.quantization,
+        "quantize_dynamic",
+        _raise_quantization_error,
+    )
+
+    _, loaded_model = service._load_model()
+
+    assert loaded_model is fake_model
tests/test_language_service.py
ADDED
@@ -0,0 +1,34 @@
+import pytest
+
+from app.core.exceptions import LanguageDetectionError
+import app.services.language_service as language_module
+
+
+def test_detect_language_returns_en_for_english_and_non_en_for_french():
+    service = language_module.LanguageService()
+
+    english_text = "This is a detailed English sentence about technology trends and financial markets."
+    french_text = "Ceci est une phrase francaise detaillee sur les tendances technologiques et les marches financiers."
+
+    assert service.detect_language(english_text) == "en"
+    assert service.detect_language(french_text) != "en"
+
+
+def test_detect_language_raises_for_invalid_detector_output(monkeypatch):
+    service = language_module.LanguageService()
+    monkeypatch.setattr(language_module, "detect", lambda text: "")
+
+    with pytest.raises(LanguageDetectionError, match="did not return a valid label"):
+        service.detect_language("This text is long enough for language detection.")
+
+
+def test_detect_language_wraps_unexpected_detector_errors(monkeypatch):
+    service = language_module.LanguageService()
+
+    def _raise_error(_: str):
+        raise RuntimeError("unexpected detector failure")
+
+    monkeypatch.setattr(language_module, "detect", _raise_error)
+
+    with pytest.raises(LanguageDetectionError, match="unexpected error"):
+        service.detect_language("This text is long enough for language detection.")