Spaces:

AyoubChLin
/

classifier-general

Sleeping

App Files Files Community

AyoubChLin committed 12 days ago

Commit

efddb2f

1 Parent(s): 50231a8

feat: update classifier model configuration and remove external dependencies

Browse files

Files changed (8) hide show

.env.example +1 -2
.qwen/settings.json +9 -0
.qwen/settings.json.orig +7 -0
Dockerfile +2 -0
README.md +2 -1
app/core/config.py +1 -2
app/services/classifier_service.py +17 -61
requirements.txt +5 -1

.env.example CHANGED Viewed

@@ -5,8 +5,7 @@ DEBUG=false
 STATIC_DIR=static
 UPLOAD_SUBDIR=uploads
-CLASSIFIER_SPACE=https://ayoubchlin-ayoubchlin-stable-bart-mnli-cnn.hf.space/
-CLASSIFIER_API_NAME=/predict
 HUGGINGFACE_TOKEN=
 LANGUAGE_DETECTOR_URL=https://team-language-detector-languagedetector.hf.space/run/predict

 STATIC_DIR=static
 UPLOAD_SUBDIR=uploads
+CLASSIFIER_MODEL=AyoubChLin/distilbert_cnn_news
 HUGGINGFACE_TOKEN=
 LANGUAGE_DETECTOR_URL=https://team-language-detector-languagedetector.hf.space/run/predict

.qwen/settings.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "permissions": {
+    "allow": [
+      "Bash(pip install *)",
+      "Bash(pip3 install *)"
+    ]
+  },
+  "$version": 3
+}

.qwen/settings.json.orig ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "permissions": {
+    "allow": [
+      "Bash(pip install *)"
+    ]
+  }
+}

Dockerfile CHANGED Viewed

@@ -12,6 +12,8 @@ RUN apt-get update \
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 EXPOSE 4002

 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+RUN huggingface-cli login --token ${HUGGINGFACE_TOKEN} 2>/dev/null || true
 COPY . .
 EXPOSE 4002

README.md CHANGED Viewed

@@ -39,7 +39,7 @@ cp .env.example .env
 ```
 Key vars:
-- `CLASSIFIER_SPACE`
 - `HUGGINGFACE_TOKEN`
 - `LANGUAGE_DETECTOR_URL`
 - `DEFAULT_LABELS_CSV`
@@ -63,3 +63,4 @@ pytest -q
 ## Notes
 - OCR requires `tesseract-ocr` (installed in Dockerfile).
 - Supported extraction formats in this refactor: `.pdf`, `.docx`, `.xlsx`, image formats, and plain text files.

 ```
 Key vars:
+- `CLASSIFIER_MODEL`
 - `HUGGINGFACE_TOKEN`
 - `LANGUAGE_DETECTOR_URL`
 - `DEFAULT_LABELS_CSV`
 ## Notes
 - OCR requires `tesseract-ocr` (installed in Dockerfile).
 - Supported extraction formats in this refactor: `.pdf`, `.docx`, `.xlsx`, image formats, and plain text files.
+- The classifier model is loaded directly from Hugging Face Hub (no external Space dependency).

app/core/config.py CHANGED Viewed

@@ -15,8 +15,7 @@ class Settings(BaseSettings):
     static_dir: Path = Path("static")
     upload_subdir: str = "uploads"
-    classifier_space: str = "https://ayoubchlin-ayoubchlin-stable-bart-mnli-cnn.hf.space/"
-    classifier_api_name: str = "/predict"
     huggingface_token: str | None = None
     language_detector_url: str = "https://team-language-detector-languagedetector.hf.space/run/predict"

     static_dir: Path = Path("static")
     upload_subdir: str = "uploads"
+    classifier_model: str = "AyoubChLin/distilbert_cnn_news"
     huggingface_token: str | None = None
     language_detector_url: str = "https://team-language-detector-languagedetector.hf.space/run/predict"

app/services/classifier_service.py CHANGED Viewed

@@ -1,8 +1,6 @@
-import json
-from pathlib import Path
 from typing import Any
-from gradio_client import Client
 from app.core.config import settings
 from app.core.exceptions import ClassificationError
@@ -10,72 +8,30 @@ from app.core.exceptions import ClassificationError
 class ClassifierService:
     def __init__(self) -> None:
-        self._client: Client | None = None
-    def _get_client(self) -> Client:
-        if self._client is not None:
-            return self._client
-        client_kwargs: dict[str, Any] = {}
-        if settings.huggingface_token:
-            client_kwargs["hf_token"] = settings.huggingface_token
-        try:
-            self._client = Client(settings.classifier_space, **client_kwargs)
-        except Exception as exc:
-            raise ClassificationError("Unable to initialize classifier client") from exc
-        return self._client
-    @staticmethod
-    def _extract_label(payload: Any) -> str | None:
-        if isinstance(payload, dict):
-            value = payload.get("label")
-            if isinstance(value, str) and value.strip():
-                return value.strip()
-            return None
-        if isinstance(payload, list):
-            for item in payload:
-                label = ClassifierService._extract_label(item)
-                if label:
-                    return label
-        return None
     def classify(self, text: str, labels: list[str]) -> str:
         if not labels:
             raise ClassificationError("No labels configured")
-        labels_text = ", ".join(labels)
         try:
-            result = self._get_client().predict(
-                text,
-                labels_text,
-                api_name=settings.classifier_api_name,
-            )
         except Exception as exc:
-            raise ClassificationError("Classifier request failed") from exc
-        if isinstance(result, str):
-            candidate_path = Path(result)
-            if candidate_path.exists():
-                try:
-                    parsed = json.loads(candidate_path.read_text(encoding="utf-8"))
-                except Exception as exc:
-                    raise ClassificationError("Classifier output file is not valid JSON") from exc
-                label = self._extract_label(parsed)
-                if label:
-                    return label
-            stripped = result.strip()
-            if stripped:
-                return stripped
-        label = self._extract_label(result)
-        if label:
-            return label
         raise ClassificationError("Classifier did not return a valid label")

 from typing import Any
+from transformers import pipeline
 from app.core.config import settings
 from app.core.exceptions import ClassificationError
 class ClassifierService:
     def __init__(self) -> None:
+        self._pipeline: Any | None = None
+    def _get_pipeline(self) -> Any:
+        if self._pipeline is None:
+            try:
+                self._pipeline = pipeline(
+                    "zero-shot-classification",
+                    model=settings.classifier_model,
+                )
+            except Exception as exc:
+                raise ClassificationError("Unable to initialize classifier pipeline") from exc
+        return self._pipeline
     def classify(self, text: str, labels: list[str]) -> str:
         if not labels:
             raise ClassificationError("No labels configured")
         try:
+            result = self._get_pipeline()(text, labels, multi_label=False)
         except Exception as exc:
+            raise ClassificationError("Classifier prediction failed") from exc
+        if isinstance(result, dict) and "labels" in result and result["labels"]:
+            return result["labels"][0]
         raise ClassificationError("Classifier did not return a valid label")

requirements.txt CHANGED Viewed

@@ -3,9 +3,13 @@ uvicorn[standard]==0.34.0
 pydantic==2.10.6
 pydantic-settings==2.7.1
 requests==2.32.3
-gradio_client==1.7.0
 python-multipart==0.0.20
 pytesseract==0.3.13
 Pillow==11.1.0
 pypdf==5.4.0

 pydantic==2.10.6
 pydantic-settings==2.7.1
 requests==2.32.3
 python-multipart==0.0.20
+transformers==4.46.0
+torch==2.5.1
+accelerate==1.1.1
+sentencepiece==0.2.0
 pytesseract==0.3.13
 Pillow==11.1.0
 pypdf==5.4.0