Upload folder using huggingface_hub
- README.md +91 -0
- handler.py +96 -0
- model_loader.py +53 -0
- requirements.txt +4 -0
README.md
ADDED
@@ -0,0 +1,91 @@
---
license: cc-by-nc-4.0
library_name: transformers
pipeline_tag: translation
base_model: facebook/nllb-200-distilled-600M
tags:
- translation
- nllb
- seq2seq
- endpoints-template
inference: true
language:
- multilingual
---

# baseline-nllb

A baseline clone of [`facebook/nllb-200-distilled-600M`](https://huggingface.co/facebook/nllb-200-distilled-600M), packaged for **Hugging Face Inference Endpoints** with a custom handler so callers can pass arbitrary NLLB Flores-200 language codes at request time.

## Deploying to Inference Endpoints

1. Open this repo on the Hub and click **Deploy → Inference Endpoints**.
2. Pick a GPU instance (the 600M model runs fine on a small GPU; a CPU instance also works but is slower).
3. Leave the container type as **Default** — the Endpoints runtime will auto-detect [`handler.py`](./handler.py) and install [`requirements.txt`](./requirements.txt).
4. Deploy.
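The same deployment can be scripted with `huggingface_hub.create_inference_endpoint`. The sketch below is untested; the vendor, region, and instance values are illustrative assumptions, so check the instance catalog in the Endpoints UI before running:

```python
# Hypothetical scripted deploy; vendor/region/instance values are examples only.
from huggingface_hub import create_inference_endpoint

endpoint = create_inference_endpoint(
    "baseline-nllb",
    repository="Resilient-Coders/baseline-nllb",
    framework="pytorch",
    task="translation",
    accelerator="gpu",
    vendor="aws",
    region="us-east-1",
    instance_size="x1",
    instance_type="nvidia-t4",
)
endpoint.wait()  # blocks until the endpoint reports "running"
print(endpoint.url)
```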
## Request format

```json
{
  "inputs": "Hello, world!",
  "parameters": {
    "src_lang": "eng_Latn",
    "tgt_lang": "spa_Latn",
    "max_length": 256,
    "num_beams": 4
  }
}
```

`inputs` may be a single string or a list of strings. `src_lang` / `tgt_lang` use the [Flores-200 codes](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) (e.g. `eng_Latn`, `spa_Latn`, `fra_Latn`, `zho_Hans`, `arb_Arab`). If omitted, the handler defaults to `eng_Latn` → `spa_Latn`.

### Response

```json
[{ "translation_text": "¡Hola, mundo!" }]
```
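Errors are reported in-band rather than as HTTP failures: `handler.py` returns an error object in the same list shape, e.g. (the language code below is just a placeholder):

```json
[{ "error": "Unknown target language code: 'xx_Xxxx'" }]
```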
## Example clients

### cURL

```bash
curl https://<your-endpoint>.endpoints.huggingface.cloud \
  -H "Authorization: Bearer $HF_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "inputs": "Hello, world!",
    "parameters": { "src_lang": "eng_Latn", "tgt_lang": "fra_Latn" }
  }'
```

### Python

```python
import requests

resp = requests.post(
    "https://<your-endpoint>.endpoints.huggingface.cloud",
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
    json={
        "inputs": ["Hello, world!", "How are you?"],
        "parameters": {"src_lang": "eng_Latn", "tgt_lang": "deu_Latn"},
    },
    timeout=30,
)
print(resp.json())
```
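### `huggingface_hub.InferenceClient`

Since the handler returns the standard translation-task output shape, the high-level client should also work against the endpoint URL. This is an untested sketch:

```python
from huggingface_hub import InferenceClient

# Point the client at the deployed endpoint URL (placeholder below).
client = InferenceClient(
    model="https://<your-endpoint>.endpoints.huggingface.cloud",
    token=HF_TOKEN,
)
out = client.translation("Hello, world!", src_lang="eng_Latn", tgt_lang="fra_Latn")
print(out.translation_text)
```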
## Files in this repo

| File | Purpose |
| --- | --- |
| `handler.py` | Custom `EndpointHandler` used by HF Inference Endpoints. |
| `requirements.txt` | Extra Python deps installed into the endpoint container. |
| `model_loader.py` | One-off script that pushed the base NLLB weights to this repo. |
| `config.json`, `tokenizer*`, `*.safetensors` | Model + tokenizer artifacts (pushed by `model_loader.py`). |

## License

Inherits `CC-BY-NC-4.0` from the upstream `facebook/nllb-200-distilled-600M` model — **non-commercial use only**.
handler.py
ADDED
@@ -0,0 +1,96 @@
"""Custom inference handler for Hugging Face Inference Endpoints.

NLLB needs a source-language code on the tokenizer and a forced BOS token
id for the target language at generation time, so the default translation
pipeline is not flexible enough. This handler accepts `src_lang` and
`tgt_lang` (NLLB Flores-200 codes, e.g. "eng_Latn", "spa_Latn") per
request.

Request format:
    {
        "inputs": "Hello, world!",      # str or List[str]
        "parameters": {
            "src_lang": "eng_Latn",     # optional, default eng_Latn
            "tgt_lang": "spa_Latn",     # optional, default spa_Latn
            "max_length": 256,          # optional
            "num_beams": 4,             # optional
            "temperature": 1.0,         # optional
            "do_sample": false          # optional
        }
    }

Response: List[{"translation_text": str}]
"""

from __future__ import annotations

from typing import Any, Dict, List, Union

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

DEFAULT_SRC_LANG = "eng_Latn"
DEFAULT_TGT_LANG = "spa_Latn"
DEFAULT_MAX_LENGTH = 256
DEFAULT_NUM_BEAMS = 4


class EndpointHandler:
    def __init__(self, path: str = "") -> None:
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # fp16 on GPU keeps latency and memory down; stay in fp32 on CPU for stability.
        dtype = torch.float16 if self.device == "cuda" else torch.float32

        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            path, torch_dtype=dtype
        ).to(self.device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        inputs: Union[str, List[str], None] = data.get("inputs")
        if inputs is None:
            return [{"error": "Missing 'inputs' field."}]
        if isinstance(inputs, str):
            inputs = [inputs]
        if not all(isinstance(x, str) for x in inputs):
            return [{"error": "'inputs' must be a string or a list of strings."}]

        params: Dict[str, Any] = data.get("parameters") or {}
        src_lang = params.get("src_lang", DEFAULT_SRC_LANG)
        tgt_lang = params.get("tgt_lang", DEFAULT_TGT_LANG)
        max_length = int(params.get("max_length", DEFAULT_MAX_LENGTH))
        num_beams = int(params.get("num_beams", DEFAULT_NUM_BEAMS))
        do_sample = bool(params.get("do_sample", False))
        temperature = float(params.get("temperature", 1.0))

        # convert_tokens_to_ids maps unknown tokens to unk rather than raising,
        # so check against unk_token_id explicitly to reject bad language codes.
        try:
            forced_bos_token_id = self.tokenizer.convert_tokens_to_ids(tgt_lang)
        except Exception:
            return [{"error": f"Unknown target language code: {tgt_lang!r}"}]
        if forced_bos_token_id == self.tokenizer.unk_token_id:
            return [{"error": f"Unknown target language code: {tgt_lang!r}"}]

        # src_lang tells the NLLB tokenizer which language tag to prepend to the source.
        self.tokenizer.src_lang = src_lang
        encoded = self.tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(self.device)

        with torch.inference_mode():
            generated = self.model.generate(
                **encoded,
                forced_bos_token_id=forced_bos_token_id,
                max_length=max_length,
                num_beams=num_beams,
                do_sample=do_sample,
                temperature=temperature,
            )

        decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True)
        return [{"translation_text": t} for t in decoded]
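For local debugging, the handler can be exercised directly. A minimal smoke test, assuming it runs from a local clone of this repo with the dependencies from `requirements.txt` installed (`path="."` and the sample texts are illustrative):

```python
# Minimal local smoke test for handler.py (illustrative; not part of the repo).
from handler import EndpointHandler

handler = EndpointHandler(path=".")  # "." = directory holding config.json etc.
out = handler({
    "inputs": ["Hello, world!", "How are you?"],
    "parameters": {"src_lang": "eng_Latn", "tgt_lang": "fra_Latn"},
})
print(out)  # e.g. [{'translation_text': '...'}, {'translation_text': '...'}]
```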
model_loader.py
ADDED
@@ -0,0 +1,53 @@
"""Download the base NLLB model and push a deploy-ready copy to the Hub.

Pushes both the weights/tokenizer and the Inference-Endpoints artifacts
(`handler.py`, `requirements.txt`, `README.md`) so a subsequent
"Deploy → Inference Endpoints" click on the Hub just works.

Usage:
    huggingface-cli login   # or set HF_TOKEN
    python model_loader.py
"""

from __future__ import annotations

from pathlib import Path

from huggingface_hub import HfApi
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

BASE = "facebook/nllb-200-distilled-600M"
REPO = "Resilient-Coders/baseline-nllb"

ENDPOINT_FILES = ("handler.py", "requirements.txt", "README.md")


def push_weights() -> None:
    tokenizer = AutoTokenizer.from_pretrained(BASE)
    model = AutoModelForSeq2SeqLM.from_pretrained(BASE)

    tokenizer.push_to_hub(REPO)
    model.push_to_hub(REPO)


def push_endpoint_files() -> None:
    api = HfApi()
    api.create_repo(REPO, exist_ok=True)
    repo_root = Path(__file__).resolve().parent
    for name in ENDPOINT_FILES:
        path = repo_root / name
        if not path.exists():
            print(f"[skip] {name} not found next to model_loader.py")
            continue
        api.upload_file(
            path_or_fileobj=str(path),
            path_in_repo=name,
            repo_id=REPO,
            commit_message=f"Update {name}",
        )
        print(f"[ok] uploaded {name}")


if __name__ == "__main__":
    push_weights()
    push_endpoint_files()
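After the script runs, a quick sanity check that everything landed. A sketch using `HfApi.list_repo_files`; the list of expected files below is illustrative:

```python
from huggingface_hub import HfApi

files = HfApi().list_repo_files("Resilient-Coders/baseline-nllb")
# Spot-check that the endpoint artifacts sit alongside the weights.
for name in ("handler.py", "requirements.txt", "README.md", "config.json"):
    print(name, "ok" if name in files else "MISSING")
```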
requirements.txt
ADDED
@@ -0,0 +1,4 @@
transformers>=4.40.0
torch>=2.1.0
sentencepiece>=0.1.99
protobuf>=3.20.0