Spaces:
Sleeping
Sleeping
Commit ·
efddb2f
1
Parent(s): 50231a8
feat: update classifier model configuration and remove external dependencies
Browse files
- .env.example +1 -2
- .qwen/settings.json +9 -0
- .qwen/settings.json.orig +7 -0
- Dockerfile +2 -0
- README.md +2 -1
- app/core/config.py +1 -2
- app/services/classifier_service.py +17 -61
- requirements.txt +5 -1
.env.example
CHANGED
|
@@ -5,8 +5,7 @@ DEBUG=false
|
|
| 5 |
STATIC_DIR=static
|
| 6 |
UPLOAD_SUBDIR=uploads
|
| 7 |
|
| 8 |
-
|
| 9 |
-
CLASSIFIER_API_NAME=/predict
|
| 10 |
HUGGINGFACE_TOKEN=
|
| 11 |
|
| 12 |
LANGUAGE_DETECTOR_URL=https://team-language-detector-languagedetector.hf.space/run/predict
|
|
|
|
| 5 |
STATIC_DIR=static
|
| 6 |
UPLOAD_SUBDIR=uploads
|
| 7 |
|
| 8 |
+
CLASSIFIER_MODEL=AyoubChLin/distilbert_cnn_news
|
|
|
|
| 9 |
HUGGINGFACE_TOKEN=
|
| 10 |
|
| 11 |
LANGUAGE_DETECTOR_URL=https://team-language-detector-languagedetector.hf.space/run/predict
|
.qwen/settings.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"permissions": {
|
| 3 |
+
"allow": [
|
| 4 |
+
"Bash(pip install *)",
|
| 5 |
+
"Bash(pip3 install *)"
|
| 6 |
+
]
|
| 7 |
+
},
|
| 8 |
+
"$version": 3
|
| 9 |
+
}
|
.qwen/settings.json.orig
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"permissions": {
|
| 3 |
+
"allow": [
|
| 4 |
+
"Bash(pip install *)"
|
| 5 |
+
]
|
| 6 |
+
}
|
| 7 |
+
}
|
Dockerfile
CHANGED
|
@@ -12,6 +12,8 @@ RUN apt-get update \
|
|
| 12 |
COPY requirements.txt .
|
| 13 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 14 |
|
|
|
|
|
|
|
| 15 |
COPY . .
|
| 16 |
|
| 17 |
EXPOSE 4002
|
|
|
|
| 12 |
COPY requirements.txt .
|
| 13 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 14 |
|
| 15 |
+
RUN huggingface-cli login --token ${HUGGINGFACE_TOKEN} 2>/dev/null || true
|
| 16 |
+
|
| 17 |
COPY . .
|
| 18 |
|
| 19 |
EXPOSE 4002
|
README.md
CHANGED
|
@@ -39,7 +39,7 @@ cp .env.example .env
|
|
| 39 |
```
|
| 40 |
|
| 41 |
Key vars:
|
| 42 |
-
- `
|
| 43 |
- `HUGGINGFACE_TOKEN`
|
| 44 |
- `LANGUAGE_DETECTOR_URL`
|
| 45 |
- `DEFAULT_LABELS_CSV`
|
|
@@ -63,3 +63,4 @@ pytest -q
|
|
| 63 |
## Notes
|
| 64 |
- OCR requires `tesseract-ocr` (installed in Dockerfile).
|
| 65 |
- Supported extraction formats in this refactor: `.pdf`, `.docx`, `.xlsx`, image formats, and plain text files.
|
|
|
|
|
|
| 39 |
```
|
| 40 |
|
| 41 |
Key vars:
|
| 42 |
+
- `CLASSIFIER_MODEL`
|
| 43 |
- `HUGGINGFACE_TOKEN`
|
| 44 |
- `LANGUAGE_DETECTOR_URL`
|
| 45 |
- `DEFAULT_LABELS_CSV`
|
|
|
|
| 63 |
## Notes
|
| 64 |
- OCR requires `tesseract-ocr` (installed in Dockerfile).
|
| 65 |
- Supported extraction formats in this refactor: `.pdf`, `.docx`, `.xlsx`, image formats, and plain text files.
|
| 66 |
+
- The classifier model is loaded directly from Hugging Face Hub (no external Space dependency).
|
app/core/config.py
CHANGED
|
@@ -15,8 +15,7 @@ class Settings(BaseSettings):
|
|
| 15 |
static_dir: Path = Path("static")
|
| 16 |
upload_subdir: str = "uploads"
|
| 17 |
|
| 18 |
-
|
| 19 |
-
classifier_api_name: str = "/predict"
|
| 20 |
huggingface_token: str | None = None
|
| 21 |
|
| 22 |
language_detector_url: str = "https://team-language-detector-languagedetector.hf.space/run/predict"
|
|
|
|
| 15 |
static_dir: Path = Path("static")
|
| 16 |
upload_subdir: str = "uploads"
|
| 17 |
|
| 18 |
+
classifier_model: str = "AyoubChLin/distilbert_cnn_news"
|
|
|
|
| 19 |
huggingface_token: str | None = None
|
| 20 |
|
| 21 |
language_detector_url: str = "https://team-language-detector-languagedetector.hf.space/run/predict"
|
app/services/classifier_service.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
-
import json
|
| 2 |
-
from pathlib import Path
|
| 3 |
from typing import Any
|
| 4 |
|
| 5 |
-
from gradio_client import Client
|
| 6 |
|
| 7 |
from app.core.config import settings
|
| 8 |
from app.core.exceptions import ClassificationError
|
|
@@ -10,72 +8,30 @@ from app.core.exceptions import ClassificationError
|
|
| 10 |
|
| 11 |
class ClassifierService:
|
| 12 |
def __init__(self) -> None:
|
| 13 |
-
self.
|
| 14 |
-
|
| 15 |
-
def
|
| 16 |
-
if self.
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
except Exception as exc:
|
| 26 |
-
raise ClassificationError("Unable to initialize classifier client") from exc
|
| 27 |
-
|
| 28 |
-
return self._client
|
| 29 |
-
|
| 30 |
-
@staticmethod
|
| 31 |
-
def _extract_label(payload: Any) -> str | None:
|
| 32 |
-
if isinstance(payload, dict):
|
| 33 |
-
value = payload.get("label")
|
| 34 |
-
if isinstance(value, str) and value.strip():
|
| 35 |
-
return value.strip()
|
| 36 |
-
return None
|
| 37 |
-
|
| 38 |
-
if isinstance(payload, list):
|
| 39 |
-
for item in payload:
|
| 40 |
-
label = ClassifierService._extract_label(item)
|
| 41 |
-
if label:
|
| 42 |
-
return label
|
| 43 |
-
|
| 44 |
-
return None
|
| 45 |
|
| 46 |
def classify(self, text: str, labels: list[str]) -> str:
|
| 47 |
if not labels:
|
| 48 |
raise ClassificationError("No labels configured")
|
| 49 |
|
| 50 |
-
labels_text = ", ".join(labels)
|
| 51 |
-
|
| 52 |
try:
|
| 53 |
-
result = self.
|
| 54 |
-
text,
|
| 55 |
-
labels_text,
|
| 56 |
-
api_name=settings.classifier_api_name,
|
| 57 |
-
)
|
| 58 |
except Exception as exc:
|
| 59 |
-
raise ClassificationError("Classifier
|
| 60 |
-
|
| 61 |
-
if isinstance(result, str):
|
| 62 |
-
candidate_path = Path(result)
|
| 63 |
-
if candidate_path.exists():
|
| 64 |
-
try:
|
| 65 |
-
parsed = json.loads(candidate_path.read_text(encoding="utf-8"))
|
| 66 |
-
except Exception as exc:
|
| 67 |
-
raise ClassificationError("Classifier output file is not valid JSON") from exc
|
| 68 |
-
label = self._extract_label(parsed)
|
| 69 |
-
if label:
|
| 70 |
-
return label
|
| 71 |
-
|
| 72 |
-
stripped = result.strip()
|
| 73 |
-
if stripped:
|
| 74 |
-
return stripped
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
return label
|
| 79 |
|
| 80 |
raise ClassificationError("Classifier did not return a valid label")
|
| 81 |
|
|
|
|
|
|
|
|
|
|
| 1 |
from typing import Any
|
| 2 |
|
| 3 |
+
from transformers import pipeline
|
| 4 |
|
| 5 |
from app.core.config import settings
|
| 6 |
from app.core.exceptions import ClassificationError
|
|
|
|
| 8 |
|
| 9 |
class ClassifierService:
|
| 10 |
def __init__(self) -> None:
|
| 11 |
+
self._pipeline: Any | None = None
|
| 12 |
+
|
| 13 |
+
def _get_pipeline(self) -> Any:
|
| 14 |
+
if self._pipeline is None:
|
| 15 |
+
try:
|
| 16 |
+
self._pipeline = pipeline(
|
| 17 |
+
"zero-shot-classification",
|
| 18 |
+
model=settings.classifier_model,
|
| 19 |
+
)
|
| 20 |
+
except Exception as exc:
|
| 21 |
+
raise ClassificationError("Unable to initialize classifier pipeline") from exc
|
| 22 |
+
return self._pipeline
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
def classify(self, text: str, labels: list[str]) -> str:
|
| 25 |
if not labels:
|
| 26 |
raise ClassificationError("No labels configured")
|
| 27 |
|
|
|
|
|
|
|
| 28 |
try:
|
| 29 |
+
result = self._get_pipeline()(text, labels, multi_label=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
except Exception as exc:
|
| 31 |
+
raise ClassificationError("Classifier prediction failed") from exc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
+
if isinstance(result, dict) and "labels" in result and result["labels"]:
|
| 34 |
+
return result["labels"][0]
|
|
|
|
| 35 |
|
| 36 |
raise ClassificationError("Classifier did not return a valid label")
|
| 37 |
|
requirements.txt
CHANGED
|
@@ -3,9 +3,13 @@ uvicorn[standard]==0.34.0
|
|
| 3 |
pydantic==2.10.6
|
| 4 |
pydantic-settings==2.7.1
|
| 5 |
requests==2.32.3
|
| 6 |
-
gradio_client==1.7.0
|
| 7 |
python-multipart==0.0.20
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
pytesseract==0.3.13
|
| 10 |
Pillow==11.1.0
|
| 11 |
pypdf==5.4.0
|
|
|
|
| 3 |
pydantic==2.10.6
|
| 4 |
pydantic-settings==2.7.1
|
| 5 |
requests==2.32.3
|
|
|
|
| 6 |
python-multipart==0.0.20
|
| 7 |
|
| 8 |
+
transformers==4.46.0
|
| 9 |
+
torch==2.5.1
|
| 10 |
+
accelerate==1.1.1
|
| 11 |
+
sentencepiece==0.2.0
|
| 12 |
+
|
| 13 |
pytesseract==0.3.13
|
| 14 |
Pillow==11.1.0
|
| 15 |
pypdf==5.4.0
|