Upload folder using huggingface_hub
- README.md +91 -0
- handler.py +96 -0
- model_loader.py +53 -0
- requirements.txt +4 -0
README.md
ADDED
@@ -0,0 +1,91 @@
---
license: cc-by-nc-4.0
library_name: transformers
pipeline_tag: translation
base_model: facebook/nllb-200-distilled-600M
tags:
- translation
- nllb
- seq2seq
- endpoints-template
inference: true
language:
- multilingual
---

# baseline-nllb

A baseline clone of [`facebook/nllb-200-distilled-600M`](https://huggingface.co/facebook/nllb-200-distilled-600M), packaged for **Hugging Face Inference Endpoints** with a custom handler so callers can pass arbitrary NLLB Flores-200 language codes at request time.

## Deploying to Inference Endpoints

1. Open this repo on the Hub and click **Deploy → Inference Endpoints**.
2. Pick a GPU instance (the 600M model runs fine on a small GPU; a CPU instance also works but is slower).
3. Leave the container type as **Default** — the Endpoints runtime will auto-detect [`handler.py`](./handler.py) and install [`requirements.txt`](./requirements.txt).
4. Deploy.
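The same deployment can be scripted with `huggingface_hub.create_inference_endpoint`. The sketch below is untested; the vendor, region, and instance values are illustrative assumptions, so check the instance catalog in the Endpoints UI before running:

```python
# Hypothetical scripted deploy; vendor/region/instance values are examples only.
from huggingface_hub import create_inference_endpoint

endpoint = create_inference_endpoint(
    "baseline-nllb",
    repository="Resilient-Coders/baseline-nllb",
    framework="pytorch",
    task="translation",
    accelerator="gpu",
    vendor="aws",
    region="us-east-1",
    instance_size="x1",
    instance_type="nvidia-t4",
)
endpoint.wait()  # blocks until the endpoint reports "running"
print(endpoint.url)
```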
## Request format

```json
{
  "inputs": "Hello, world!",
  "parameters": {
    "src_lang": "eng_Latn",
    "tgt_lang": "spa_Latn",
    "max_length": 256,
    "num_beams": 4
  }
}
```

`inputs` may be a single string or a list of strings. `src_lang` / `tgt_lang` use the [Flores-200 codes](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) (e.g. `eng_Latn`, `spa_Latn`, `fra_Latn`, `zho_Hans`, `arb_Arab`). If omitted, the handler defaults to `eng_Latn` → `spa_Latn`.

### Response

```json
[{ "translation_text": "¡Hola, mundo!" }]
```
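Errors are reported in-band rather than as HTTP failures: `handler.py` returns an error object in the same list shape, e.g. (the language code below is just a placeholder):

```json
[{ "error": "Unknown target language code: 'xx_Xxxx'" }]
```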
## Example clients

### cURL

```bash
curl https://<your-endpoint>.endpoints.huggingface.cloud \
  -H "Authorization: Bearer $HF_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "inputs": "Hello, world!",
    "parameters": { "src_lang": "eng_Latn", "tgt_lang": "fra_Latn" }
  }'
```

### Python

```python
import requests

resp = requests.post(
    "https://<your-endpoint>.endpoints.huggingface.cloud",
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
    json={
        "inputs": ["Hello, world!", "How are you?"],
        "parameters": {"src_lang": "eng_Latn", "tgt_lang": "deu_Latn"},
    },
    timeout=30,
)
print(resp.json())
```
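### `huggingface_hub.InferenceClient`

Since the handler returns the standard translation-task output shape, the high-level client should also work against the endpoint URL. This is an untested sketch:

```python
from huggingface_hub import InferenceClient

# Point the client at the deployed endpoint URL (placeholder below).
client = InferenceClient(
    model="https://<your-endpoint>.endpoints.huggingface.cloud",
    token=HF_TOKEN,
)
out = client.translation("Hello, world!", src_lang="eng_Latn", tgt_lang="fra_Latn")
print(out.translation_text)
```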
## Files in this repo

| File | Purpose |
| --- | --- |
| `handler.py` | Custom `EndpointHandler` used by HF Inference Endpoints. |
| `requirements.txt` | Extra Python deps installed into the endpoint container. |
| `model_loader.py` | One-off script that pushed the base NLLB weights to this repo. |
| `config.json`, `tokenizer*`, `*.safetensors` | Model + tokenizer artifacts (pushed by `model_loader.py`). |

## License

Inherits `CC-BY-NC-4.0` from the upstream `facebook/nllb-200-distilled-600M` model — **non-commercial use only**.
handler.py
ADDED
@@ -0,0 +1,96 @@
"""Custom inference handler for Hugging Face Inference Endpoints.

NLLB needs a source-language code on the tokenizer and a forced BOS token
id for the target language at generation time, so the default translation
pipeline is not flexible enough. This handler accepts `src_lang` and
`tgt_lang` (NLLB Flores-200 codes, e.g. "eng_Latn", "spa_Latn") per
request.

Request format:
    {
        "inputs": "Hello, world!",      # str or List[str]
        "parameters": {
            "src_lang": "eng_Latn",     # optional, default eng_Latn
            "tgt_lang": "spa_Latn",     # optional, default spa_Latn
            "max_length": 256,          # optional
            "num_beams": 4,             # optional
            "temperature": 1.0,         # optional
            "do_sample": false          # optional
        }
    }

Response: List[{"translation_text": str}]
"""

from __future__ import annotations

from typing import Any, Dict, List, Union

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

DEFAULT_SRC_LANG = "eng_Latn"
DEFAULT_TGT_LANG = "spa_Latn"
DEFAULT_MAX_LENGTH = 256
DEFAULT_NUM_BEAMS = 4


class EndpointHandler:
    def __init__(self, path: str = "") -> None:
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # fp16 on GPU keeps latency and memory down; stay in fp32 on CPU for stability.
        dtype = torch.float16 if self.device == "cuda" else torch.float32

        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            path, torch_dtype=dtype
        ).to(self.device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        inputs: Union[str, List[str], None] = data.get("inputs")
        if inputs is None:
            return [{"error": "Missing 'inputs' field."}]
        if isinstance(inputs, str):
            inputs = [inputs]
        if not all(isinstance(x, str) for x in inputs):
            return [{"error": "'inputs' must be a string or a list of strings."}]

        params: Dict[str, Any] = data.get("parameters") or {}
        src_lang = params.get("src_lang", DEFAULT_SRC_LANG)
        tgt_lang = params.get("tgt_lang", DEFAULT_TGT_LANG)
        max_length = int(params.get("max_length", DEFAULT_MAX_LENGTH))
        num_beams = int(params.get("num_beams", DEFAULT_NUM_BEAMS))
        do_sample = bool(params.get("do_sample", False))
        temperature = float(params.get("temperature", 1.0))

        # convert_tokens_to_ids maps unknown tokens to unk rather than raising,
        # so check against unk_token_id explicitly to reject bad language codes.
        try:
            forced_bos_token_id = self.tokenizer.convert_tokens_to_ids(tgt_lang)
        except Exception:
            return [{"error": f"Unknown target language code: {tgt_lang!r}"}]
        if forced_bos_token_id == self.tokenizer.unk_token_id:
            return [{"error": f"Unknown target language code: {tgt_lang!r}"}]

        # src_lang tells the NLLB tokenizer which language tag to prepend to the source.
        self.tokenizer.src_lang = src_lang
        encoded = self.tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(self.device)

        with torch.inference_mode():
            generated = self.model.generate(
                **encoded,
                forced_bos_token_id=forced_bos_token_id,
                max_length=max_length,
                num_beams=num_beams,
                do_sample=do_sample,
                temperature=temperature,
            )

        decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True)
        return [{"translation_text": t} for t in decoded]
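For local debugging, the handler can be exercised directly. A minimal smoke test, assuming it runs from a local clone of this repo with the dependencies from `requirements.txt` installed (`path="."` and the sample texts are illustrative):

```python
# Minimal local smoke test for handler.py (illustrative; not part of the repo).
from handler import EndpointHandler

handler = EndpointHandler(path=".")  # "." = directory holding config.json etc.
out = handler({
    "inputs": ["Hello, world!", "How are you?"],
    "parameters": {"src_lang": "eng_Latn", "tgt_lang": "fra_Latn"},
})
print(out)  # e.g. [{'translation_text': '...'}, {'translation_text': '...'}]
```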
model_loader.py
ADDED
@@ -0,0 +1,53 @@
"""Download the base NLLB model and push a deploy-ready copy to the Hub.

Pushes both the weights/tokenizer and the Inference-Endpoints artifacts
(`handler.py`, `requirements.txt`, `README.md`) so a subsequent
"Deploy → Inference Endpoints" click on the Hub just works.

Usage:
    huggingface-cli login   # or set HF_TOKEN
    python model_loader.py
"""

from __future__ import annotations

from pathlib import Path

from huggingface_hub import HfApi
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

BASE = "facebook/nllb-200-distilled-600M"
REPO = "Resilient-Coders/baseline-nllb"

ENDPOINT_FILES = ("handler.py", "requirements.txt", "README.md")


def push_weights() -> None:
    tokenizer = AutoTokenizer.from_pretrained(BASE)
    model = AutoModelForSeq2SeqLM.from_pretrained(BASE)

    tokenizer.push_to_hub(REPO)
    model.push_to_hub(REPO)


def push_endpoint_files() -> None:
    api = HfApi()
    api.create_repo(REPO, exist_ok=True)
    repo_root = Path(__file__).resolve().parent
    for name in ENDPOINT_FILES:
        path = repo_root / name
        if not path.exists():
            print(f"[skip] {name} not found next to model_loader.py")
            continue
        api.upload_file(
            path_or_fileobj=str(path),
            path_in_repo=name,
            repo_id=REPO,
            commit_message=f"Update {name}",
        )
        print(f"[ok] uploaded {name}")


if __name__ == "__main__":
    push_weights()
    push_endpoint_files()
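After the script runs, a quick sanity check that everything landed. A sketch using `HfApi.list_repo_files`; the list of expected files below is illustrative:

```python
from huggingface_hub import HfApi

files = HfApi().list_repo_files("Resilient-Coders/baseline-nllb")
# Spot-check that the endpoint artifacts sit alongside the weights.
for name in ("handler.py", "requirements.txt", "README.md", "config.json"):
    print(name, "ok" if name in files else "MISSING")
```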
requirements.txt
ADDED
@@ -0,0 +1,4 @@
transformers>=4.40.0
torch>=2.1.0
sentencepiece>=0.1.99
protobuf>=3.20.0