HPLT
/

hplt_bert_base_ms

@@ -41,8 +41,8 @@ This model currently needs a custom wrapper from `modeling_ltgbert.py`, you shou
 import torch
 from transformers import AutoTokenizer, AutoModelForMaskedLM
-tokenizer = AutoTokenizer.from_pretrained("HPLT/hplt_bert_base_en")
-model = AutoModelForMaskedLM.from_pretrained("HPLT/hplt_bert_base_en", trust_remote_code=True)
 mask_id = tokenizer.convert_tokens_to_ids("[MASK]")
 input_text = tokenizer("It's a beautiful[MASK].", return_tensors="pt")
@@ -61,18 +61,38 @@ We are releasing 10 intermediate checkpoints for each model at intervals of ever
 You can load a specific model revision with `transformers` using the argument `revision`:
 ```python
-model = AutoModelForMaskedLM.from_pretrained("HPLT/hplt_bert_base_en", revision="step21875", trust_remote_code=True)
 ```
 You can access all the revisions for the models with the following code:
 ```python
 from huggingface_hub import list_repo_refs
-out = list_repo_refs("HPLT/hplt_bert_base_en")
 print([b.name for b in out.branches])
 ```
 ## Cite us
 ```bibtex
 @inproceedings{de-gibert-etal-2024-new-massive,
     title = "A New Massive Multilingual Dataset for High-Performance Language Technologies",

 import torch
 from transformers import AutoTokenizer, AutoModelForMaskedLM
+tokenizer = AutoTokenizer.from_pretrained("HPLT/hplt_bert_base_ms")
+model = AutoModelForMaskedLM.from_pretrained("HPLT/hplt_bert_base_ms", trust_remote_code=True)
 mask_id = tokenizer.convert_tokens_to_ids("[MASK]")
 input_text = tokenizer("It's a beautiful[MASK].", return_tensors="pt")
 You can load a specific model revision with `transformers` using the argument `revision`:
 ```python
+model = AutoModelForMaskedLM.from_pretrained("HPLT/hplt_bert_base_ms", revision="step21875", trust_remote_code=True)
 ```
 You can access all the revisions for the models with the following code:
 ```python
 from huggingface_hub import list_repo_refs
+out = list_repo_refs("HPLT/hplt_bert_base_ms")
 print([b.name for b in out.branches])
 ```
 ## Cite us
+```bibtex
+@inproceedings{samuel-etal-2023-trained,
+    title = "Trained on 100 million words and still in shape: {BERT} meets {B}ritish {N}ational {C}orpus",
+    author = "Samuel, David  and
+      Kutuzov, Andrey  and
+      {\O}vrelid, Lilja  and
+      Velldal, Erik",
+    editor = "Vlachos, Andreas  and
+      Augenstein, Isabelle",
+    booktitle = "Findings of the Association for Computational Linguistics: EACL 2023",
+    month = may,
+    year = "2023",
+    address = "Dubrovnik, Croatia",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2023.findings-eacl.146",
+    doi = "10.18653/v1/2023.findings-eacl.146",
+    pages = "1954--1974"
+}
+```
 ```bibtex
 @inproceedings{de-gibert-etal-2024-new-massive,
     title = "A New Massive Multilingual Dataset for High-Performance Language Technologies",