ericaRC committed
Commit b79dedc · verified · Parent: 6bcb83c

Upload folder using huggingface_hub

Files changed (4)
  1. README.md +91 -0
  2. handler.py +96 -0
  3. model_loader.py +53 -0
  4. requirements.txt +4 -0
README.md ADDED
---
license: cc-by-nc-4.0
library_name: transformers
pipeline_tag: translation
base_model: facebook/nllb-200-distilled-600M
tags:
- translation
- nllb
- seq2seq
- endpoints-template
inference: true
language:
- multilingual
---

# baseline-nllb

A baseline clone of [`facebook/nllb-200-distilled-600M`](https://huggingface.co/facebook/nllb-200-distilled-600M), packaged for **Hugging Face Inference Endpoints** with a custom handler so callers can pass arbitrary NLLB Flores-200 language codes at request time.

## Deploying to Inference Endpoints

1. Open this repo on the Hub and click **Deploy → Inference Endpoints**.
2. Pick a GPU instance (the 600M model runs fine on a small GPU; a CPU instance also works but is slower).
3. Leave the container type as **Default**: the Endpoints runtime will auto-detect [`handler.py`](./handler.py) and install [`requirements.txt`](./requirements.txt).
4. Deploy.
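
If you would rather script the deployment than click through the UI, `huggingface_hub` also exposes a programmatic API. A minimal sketch, assuming a recent `huggingface_hub` release; the vendor, region, and instance values below are illustrative placeholders, so check the Endpoints catalog for valid combinations:

```python
# Sketch: create the endpoint programmatically instead of via the UI.
# vendor/region/instance values are placeholders, not recommendations.
from huggingface_hub import create_inference_endpoint

endpoint = create_inference_endpoint(
    "baseline-nllb",
    repository="Resilient-Coders/baseline-nllb",
    framework="pytorch",
    task="translation",
    accelerator="gpu",
    vendor="aws",
    region="us-east-1",
    type="protected",
    instance_size="x1",
    instance_type="nvidia-t4",
)
endpoint.wait()  # block until the endpoint reports ready
print(endpoint.url)
```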

## Request format

```json
{
  "inputs": "Hello, world!",
  "parameters": {
    "src_lang": "eng_Latn",
    "tgt_lang": "spa_Latn",
    "max_length": 256,
    "num_beams": 4
  }
}
```

`inputs` may be a single string or a list of strings. `src_lang` / `tgt_lang` use the [Flores-200 codes](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) (e.g. `eng_Latn`, `spa_Latn`, `fra_Latn`, `zho_Hans`, `arb_Arab`). If omitted, the handler defaults to `eng_Latn` → `spa_Latn`.
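
The codes work because NLLB's tokenizer treats each Flores-200 language code as a vocabulary token; the handler resolves the target code to a token id and forces it as the first generated token. A quick way to sanity-check a code before sending it, mirroring what `handler.py` does:

```python
# Sanity-check a Flores-200 code: valid codes resolve to a real token id,
# unknown strings fall back to the tokenizer's <unk> id.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
bos_id = tok.convert_tokens_to_ids("fra_Latn")
print(bos_id, bos_id != tok.unk_token_id)  # prints True for a valid code
```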

### Response

```json
[{ "translation_text": "¡Hola, mundo!" }]
```

For batched `inputs`, the list contains one `translation_text` object per input, in order.

## Example clients

### cURL

```bash
curl https://<your-endpoint>.endpoints.huggingface.cloud \
  -H "Authorization: Bearer $HF_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "inputs": "Hello, world!",
    "parameters": { "src_lang": "eng_Latn", "tgt_lang": "fra_Latn" }
  }'
```

### Python

```python
import os

import requests

HF_TOKEN = os.environ["HF_TOKEN"]  # your Hugging Face access token

resp = requests.post(
    "https://<your-endpoint>.endpoints.huggingface.cloud",
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
    json={
        "inputs": ["Hello, world!", "How are you?"],
        "parameters": {"src_lang": "eng_Latn", "tgt_lang": "deu_Latn"},
    },
    timeout=30,
)
print(resp.json())
```

## Files in this repo

| File | Purpose |
| --- | --- |
| `handler.py` | Custom `EndpointHandler` used by HF Inference Endpoints. |
| `requirements.txt` | Extra Python deps installed into the endpoint container. |
| `model_loader.py` | One-off script that pushed the base NLLB weights to this repo. |
| `config.json`, `tokenizer*`, `*.safetensors` | Model + tokenizer artifacts (pushed by `model_loader.py`). |

## License

Inherits `CC-BY-NC-4.0` from the upstream `facebook/nllb-200-distilled-600M` model: **non-commercial use only**.
handler.py ADDED
"""Custom inference handler for Hugging Face Inference Endpoints.

NLLB needs a source-language code on the tokenizer and a forced BOS token
id for the target language at generation time, so the default translation
pipeline is not flexible enough. This handler accepts `src_lang` and
`tgt_lang` (NLLB Flores-200 codes, e.g. "eng_Latn", "spa_Latn") per
request.

Request format:
    {
        "inputs": "Hello, world!",   # str or List[str]
        "parameters": {
            "src_lang": "eng_Latn",  # optional, default eng_Latn
            "tgt_lang": "spa_Latn",  # optional, default spa_Latn
            "max_length": 256,       # optional
            "num_beams": 4,          # optional
            "temperature": 1.0,      # optional
            "do_sample": false       # optional
        }
    }

Response: List[{"translation_text": str}]
"""

from __future__ import annotations

from typing import Any, Dict, List, Union

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

DEFAULT_SRC_LANG = "eng_Latn"
DEFAULT_TGT_LANG = "spa_Latn"
DEFAULT_MAX_LENGTH = 256
DEFAULT_NUM_BEAMS = 4


class EndpointHandler:
    def __init__(self, path: str = "") -> None:
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # fp16 on GPU keeps latency and memory down; stay in fp32 on CPU for stability.
        dtype = torch.float16 if self.device == "cuda" else torch.float32

        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            path, torch_dtype=dtype
        ).to(self.device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        inputs: Union[str, List[str], None] = data.get("inputs")
        if inputs is None:
            return [{"error": "Missing 'inputs' field."}]
        if isinstance(inputs, str):
            inputs = [inputs]
        if not all(isinstance(x, str) for x in inputs):
            return [{"error": "'inputs' must be a string or a list of strings."}]

        params: Dict[str, Any] = data.get("parameters") or {}
        src_lang = params.get("src_lang", DEFAULT_SRC_LANG)
        tgt_lang = params.get("tgt_lang", DEFAULT_TGT_LANG)
        max_length = int(params.get("max_length", DEFAULT_MAX_LENGTH))
        num_beams = int(params.get("num_beams", DEFAULT_NUM_BEAMS))
        do_sample = bool(params.get("do_sample", False))
        temperature = float(params.get("temperature", 1.0))

        try:
            forced_bos_token_id = self.tokenizer.convert_tokens_to_ids(tgt_lang)
        except Exception:
            return [{"error": f"Unknown target language code: {tgt_lang!r}"}]
        if forced_bos_token_id == self.tokenizer.unk_token_id:
            return [{"error": f"Unknown target language code: {tgt_lang!r}"}]

        self.tokenizer.src_lang = src_lang
        encoded = self.tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(self.device)

        with torch.inference_mode():
            generated = self.model.generate(
                **encoded,
                forced_bos_token_id=forced_bos_token_id,
                max_length=max_length,
                num_beams=num_beams,
                do_sample=do_sample,
                temperature=temperature,
            )

        decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True)
        return [{"translation_text": t} for t in decoded]
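
For a quick check before deploying, the handler can be exercised locally. A minimal sketch, assuming you run it from a clone of this repo (so the weights and tokenizer files sit next to `handler.py`) with enough memory for the 600M model:

```python
# Local smoke test for handler.py; run from a checkout of this repo.
from handler import EndpointHandler

handler = EndpointHandler(path=".")  # "." = directory holding config.json etc.
print(handler({
    "inputs": "Hello, world!",
    "parameters": {"src_lang": "eng_Latn", "tgt_lang": "fra_Latn"},
}))
# e.g. [{'translation_text': 'Bonjour, le monde !'}]
```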
model_loader.py ADDED
"""Download the base NLLB model and push a deploy-ready copy to the Hub.

Pushes both the weights/tokenizer and the Inference-Endpoints artifacts
(`handler.py`, `requirements.txt`, `README.md`) so a subsequent
"Deploy → Inference Endpoints" click on the Hub just works.

Usage:
    huggingface-cli login  # or set HF_TOKEN
    python model_loader.py
"""

from __future__ import annotations

from pathlib import Path

from huggingface_hub import HfApi
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

BASE = "facebook/nllb-200-distilled-600M"
REPO = "Resilient-Coders/baseline-nllb"

ENDPOINT_FILES = ("handler.py", "requirements.txt", "README.md")


def push_weights() -> None:
    tokenizer = AutoTokenizer.from_pretrained(BASE)
    model = AutoModelForSeq2SeqLM.from_pretrained(BASE)

    tokenizer.push_to_hub(REPO)
    model.push_to_hub(REPO)


def push_endpoint_files() -> None:
    api = HfApi()
    api.create_repo(REPO, exist_ok=True)
    repo_root = Path(__file__).resolve().parent
    for name in ENDPOINT_FILES:
        path = repo_root / name
        if not path.exists():
            print(f"[skip] {name} not found next to model_loader.py")
            continue
        api.upload_file(
            path_or_fileobj=str(path),
            path_in_repo=name,
            repo_id=REPO,
            commit_message=f"Update {name}",
        )
        print(f"[ok] uploaded {name}")


if __name__ == "__main__":
    push_weights()
    push_endpoint_files()
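
After the script runs, a quick listing confirms everything landed in the repo. A small sketch using `huggingface_hub`; `config.json` is included in the check because `push_to_hub` writes it alongside the weights:

```python
# Verify the upload: list repo files and check the endpoint artifacts.
from huggingface_hub import HfApi

files = set(HfApi().list_repo_files("Resilient-Coders/baseline-nllb"))
for name in ("handler.py", "requirements.txt", "README.md", "config.json"):
    print(f"[{'ok' if name in files else 'MISSING'}] {name}")
```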
requirements.txt ADDED
transformers>=4.40.0
torch>=2.1.0
sentencepiece>=0.1.99
protobuf>=3.20.0