Upload 8 files
Browse files- .gitattributes +1 -0
- README.md +348 -0
- config.json +456 -0
- model.safetensors +3 -0
- tokenizer.json +3 -0
- tokenizer_config.json +14 -0
- trainer_state.json +0 -0
- training_args.bin +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: transformers
|
| 3 |
+
license: mit
|
| 4 |
+
base_model: xlm-roberta-base
|
| 5 |
+
tags:
|
| 6 |
+
- generated_from_trainer
|
| 7 |
+
- language-identification
|
| 8 |
+
- codeswitching
|
| 9 |
+
metrics:
|
| 10 |
+
- precision
|
| 11 |
+
- recall
|
| 12 |
+
- f1
|
| 13 |
+
- accuracy
|
| 14 |
+
language:
|
| 15 |
+
- multilingual
|
| 16 |
+
- af
|
| 17 |
+
- am
|
| 18 |
+
- ar
|
| 19 |
+
- as
|
| 20 |
+
- ba
|
| 21 |
+
- be
|
| 22 |
+
- bg
|
| 23 |
+
- bn
|
| 24 |
+
- bo
|
| 25 |
+
- br
|
| 26 |
+
- bs
|
| 27 |
+
- ca
|
| 28 |
+
- ce
|
| 29 |
+
- ckb
|
| 30 |
+
- cs
|
| 31 |
+
- cy
|
| 32 |
+
- da
|
| 33 |
+
- de
|
| 34 |
+
- dv
|
| 35 |
+
- el
|
| 36 |
+
- en
|
| 37 |
+
- eo
|
| 38 |
+
- es
|
| 39 |
+
- et
|
| 40 |
+
- eu
|
| 41 |
+
- fa
|
| 42 |
+
- fi
|
| 43 |
+
- fr
|
| 44 |
+
- ga
|
| 45 |
+
- gd
|
| 46 |
+
- gl
|
| 47 |
+
- gu
|
| 48 |
+
- he
|
| 49 |
+
- hi
|
| 50 |
+
- hr
|
| 51 |
+
- hu
|
| 52 |
+
- hy
|
| 53 |
+
- id
|
| 54 |
+
- is
|
| 55 |
+
- it
|
| 56 |
+
- ja
|
| 57 |
+
- jv
|
| 58 |
+
- ka
|
| 59 |
+
- kk
|
| 60 |
+
- km
|
| 61 |
+
- kn
|
| 62 |
+
- ko
|
| 63 |
+
- ku
|
| 64 |
+
- ky
|
| 65 |
+
- la
|
| 66 |
+
- lb
|
| 67 |
+
- lo
|
| 68 |
+
- lt
|
| 69 |
+
- lv
|
| 70 |
+
- mg
|
| 71 |
+
- mk
|
| 72 |
+
- ml
|
| 73 |
+
- mn
|
| 74 |
+
- mr
|
| 75 |
+
- ms
|
| 76 |
+
- mt
|
| 77 |
+
- my
|
| 78 |
+
- ne
|
| 79 |
+
- nl
|
| 80 |
+
- 'no'
|
| 81 |
+
- ny
|
| 82 |
+
- oc
|
| 83 |
+
- om
|
| 84 |
+
- or
|
| 85 |
+
- pa
|
| 86 |
+
- pl
|
| 87 |
+
- ps
|
| 88 |
+
- pt
|
| 89 |
+
- rm
|
| 90 |
+
- ro
|
| 91 |
+
- ru
|
| 92 |
+
- sd
|
| 93 |
+
- si
|
| 94 |
+
- sk
|
| 95 |
+
- sl
|
| 96 |
+
- so
|
| 97 |
+
- sq
|
| 98 |
+
- sr
|
| 99 |
+
- su
|
| 100 |
+
- sv
|
| 101 |
+
- sw
|
| 102 |
+
- ta
|
| 103 |
+
- te
|
| 104 |
+
- tg
|
| 105 |
+
- th
|
| 106 |
+
- ti
|
| 107 |
+
- tl
|
| 108 |
+
- tr
|
| 109 |
+
- tt
|
| 110 |
+
- ug
|
| 111 |
+
- uk
|
| 112 |
+
- ur
|
| 113 |
+
- uz
|
| 114 |
+
- vi
|
| 115 |
+
- yo
|
| 116 |
+
- yi
|
| 117 |
+
- zh
|
| 118 |
+
- zu
|
| 119 |
+
model-index:
|
| 120 |
+
- name: polyglot-tagger
|
| 121 |
+
results: []
|
| 122 |
+
datasets:
|
| 123 |
+
- wikimedia/wikipedia
|
| 124 |
+
- HuggingFaceFW/finetranslations
|
| 125 |
+
- google/smol
|
| 126 |
+
- DerivedFunction/nlp-noise-snippets
|
| 127 |
+
- DerivedFunction/wikipedia-language-snippets-filtered
|
| 128 |
+
- DerivedFunction/finetranslations-filtered
|
| 129 |
+
- DerivedFunction/tatoeba-filtered
|
| 130 |
+
pipeline_tag: token-classification
|
| 131 |
+
---
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+

|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
Fine-tuned `xlm-roberta-base` for sentence-level language tagging across 100 languages.
|
| 138 |
+
The model predicts BIO-style language tags over tokens, which makes it useful for
|
| 139 |
+
language identification, code-switch detection, and multilingual document analysis.
|
| 140 |
+
|
| 141 |
+
> Compared to version 2.2, this version includes training data that attempts to fix the model scoring common grade-school words from major languages as low confidence or placing them in a minor-language bucket.
|
| 142 |
+
|
| 143 |
+
## Model description
|
| 144 |
+
|
| 145 |
+
Introducing Polyglot Tagger, a new way to classify multi-lingual documents. By training specifically on token classification on individual sentences, the model
|
| 146 |
+
generalizes well across a variety of languages, while also behaving as a multi-label classifier and extracting sentences based on their language.
|
| 147 |
+
|
| 148 |
+
## Intended uses & limitations
|
| 149 |
+
This model can be treated as a base model for further fine-tuning on specific language identification extraction tasks.
|
| 150 |
+
Note that as a general language tagging model, it can potentially be confused by closely related language families or by short texts — for example, Danish and Norwegian, Spanish and Portuguese, and Russian and Ukrainian.
|
| 151 |
+
|
| 152 |
+
The model is trained on sentences with a minimum of four tokens, so it may not accurately classify very short and ambiguous statements. Note that this model is experimental
|
| 153 |
+
and may produce unexpected results compared to generic text classifiers. It is trained on cleaned text, therefore, "messy" text may unexpectedly produce different results.
|
| 154 |
+
|
| 155 |
+
> Note that Romanized versions of any language may have no representation in the training set, such as Romanized Russian and Hindi.
|
| 156 |
+
|
| 157 |
+
### Training and Evaluation Data
|
| 158 |
+
|
| 159 |
+
A synthetic training row consists of 1-6 individual and mostly independent sentences extracted from various sources. To generalize well against multiple languages, several
|
| 160 |
+
factors were used to simulate messy text, and to reduce single character bias on certain languages:
|
| 161 |
+
- Low chance of deliberate accent stripping for languages such as Spanish and Portuguese
|
| 162 |
+
- Random chance to add in, replace, or delete punctuation, numeric, and delimiter artifacts
|
| 163 |
+
- Insert same-script characters into languages of the same family — for example, randomly injecting Arabic characters into Arabic-script languages
|
| 164 |
+
- Random chance to change the casing of compatible language scripts, such as Latin and Cyrillic.
|
| 165 |
+
- Low chance of simulating OCR and messy text with character mutation.
|
| 166 |
+
|
| 167 |
+
To generalize well on both the target language and code switching, a curriculum is provided:
|
| 168 |
+
- Pure documents 55%: Single language to learn its vocabulary, simulating a short paragraph of a single language.
|
| 169 |
+
- Homogeneous 25%: Single language + one foreign sentence to learn simple code switching.
|
| 170 |
+
- Spliced 10%: A foreign sentence is centered between two same-language sentences, with the first sentence's punctuation stripped and the second sentence forced to be lowercase.
|
| 171 |
+
- Mixed 10%: Generic mix of any languages.
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
| lang | train sentences | train tokens | eval sentences | eval tokens | all sentences | all tokens |
|
| 177 |
+
| :--- | ---: | ---: | ---: | ---: | ---: | ---: |
|
| 178 |
+
| en | 423264 (2.41%) | 9841704 (1.71%) | 3157 (3.84%) | 35025 (1.79%) | 426421 (2.41%) | 9876729 (1.71%) |
|
| 179 |
+
| es | 359106 (2.04%) | 9729675 (1.69%) | 2201 (2.68%) | 22340 (1.14%) | 361307 (2.04%) | 9752015 (1.69%) |
|
| 180 |
+
| ru | 356083 (2.02%) | 8945224 (1.56%) | 2226 (2.71%) | 21578 (1.10%) | 358309 (2.03%) | 8966802 (1.55%) |
|
| 181 |
+
| fr | 354645 (2.02%) | 10591338 (1.84%) | 2213 (2.69%) | 26148 (1.34%) | 356858 (2.02%) | 10617486 (1.84%) |
|
| 182 |
+
| ja | 352243 (2.00%) | 7945312 (1.38%) | 2219 (2.70%) | 25849 (1.32%) | 354462 (2.01%) | 7971161 (1.38%) |
|
| 183 |
+
| pt | 346042 (1.97%) | 8793674 (1.53%) | 2059 (2.50%) | 20881 (1.07%) | 348101 (1.97%) | 8814555 (1.53%) |
|
| 184 |
+
| de | 344644 (1.96%) | 8847457 (1.54%) | 2151 (2.61%) | 24958 (1.27%) | 346795 (1.96%) | 8872415 (1.54%) |
|
| 185 |
+
| it | 343887 (1.95%) | 8790806 (1.53%) | 2000 (2.43%) | 17342 (0.89%) | 345887 (1.96%) | 8808148 (1.53%) |
|
| 186 |
+
| fi | 299568 (1.70%) | 6905536 (1.20%) | 1576 (1.92%) | 14458 (0.74%) | 301144 (1.70%) | 6919994 (1.20%) |
|
| 187 |
+
| uk | 297565 (1.69%) | 7228987 (1.26%) | 1398 (1.70%) | 11391 (0.58%) | 298963 (1.69%) | 7240378 (1.26%) |
|
| 188 |
+
| zh | 294064 (1.67%) | 7329413 (1.28%) | 1717 (2.09%) | 33433 (1.71%) | 295781 (1.67%) | 7362846 (1.28%) |
|
| 189 |
+
| tr | 289328 (1.64%) | 6606625 (1.15%) | 1384 (1.68%) | 11089 (0.57%) | 290712 (1.64%) | 6617714 (1.15%) |
|
| 190 |
+
| he | 289239 (1.64%) | 7792338 (1.36%) | 1100 (1.34%) | 10342 (0.53%) | 290339 (1.64%) | 7802680 (1.35%) |
|
| 191 |
+
| pl | 288423 (1.64%) | 7707293 (1.34%) | 1305 (1.59%) | 11306 (0.58%) | 289728 (1.64%) | 7718599 (1.34%) |
|
| 192 |
+
| hu | 286880 (1.63%) | 7547282 (1.31%) | 1232 (1.50%) | 11115 (0.57%) | 288112 (1.63%) | 7558397 (1.31%) |
|
| 193 |
+
| nl | 280682 (1.60%) | 6057971 (1.05%) | 1296 (1.58%) | 10453 (0.53%) | 281978 (1.60%) | 6068424 (1.05%) |
|
| 194 |
+
| lt | 273386 (1.55%) | 6507776 (1.13%) | 1165 (1.42%) | 10491 (0.54%) | 274551 (1.55%) | 6518267 (1.13%) |
|
| 195 |
+
| eo | 267885 (1.52%) | 6554654 (1.14%) | 1055 (1.28%) | 13271 (0.68%) | 268940 (1.52%) | 6567925 (1.14%) |
|
| 196 |
+
| ar | 257437 (1.46%) | 5125573 (0.89%) | 1327 (1.61%) | 15180 (0.78%) | 258764 (1.46%) | 5140753 (0.89%) |
|
| 197 |
+
| cs | 240507 (1.37%) | 6406086 (1.11%) | 1082 (1.32%) | 9473 (0.48%) | 241589 (1.37%) | 6415559 (1.11%) |
|
| 198 |
+
| mk | 231103 (1.31%) | 6478376 (1.13%) | 953 (1.16%) | 7713 (0.39%) | 232056 (1.31%) | 6486089 (1.12%) |
|
| 199 |
+
| mr | 228596 (1.30%) | 5886608 (1.02%) | 776 (0.94%) | 6332 (0.32%) | 229372 (1.30%) | 5892940 (1.02%) |
|
| 200 |
+
| no | 223605 (1.27%) | 6137131 (1.07%) | 1396 (1.70%) | 40226 (2.05%) | 225001 (1.27%) | 6177357 (1.07%) |
|
| 201 |
+
| da | 222243 (1.26%) | 5375746 (0.94%) | 1201 (1.46%) | 10373 (0.53%) | 223444 (1.26%) | 5386119 (0.93%) |
|
| 202 |
+
| hy | 207937 (1.18%) | 6345675 (1.10%) | 791 (0.96%) | 9276 (0.47%) | 208728 (1.18%) | 6354951 (1.10%) |
|
| 203 |
+
| tl | 207674 (1.18%) | 5561702 (0.97%) | 1017 (1.24%) | 10926 (0.56%) | 208691 (1.18%) | 5572628 (0.97%) |
|
| 204 |
+
| hi | 206552 (1.17%) | 7796062 (1.36%) | 1079 (1.31%) | 47351 (2.42%) | 207631 (1.17%) | 7843413 (1.36%) |
|
| 205 |
+
| ko | 205625 (1.17%) | 6481034 (1.13%) | 1156 (1.41%) | 32355 (1.65%) | 206781 (1.17%) | 6513389 (1.13%) |
|
| 206 |
+
| el | 202334 (1.15%) | 7105554 (1.24%) | 826 (1.00%) | 13412 (0.68%) | 203160 (1.15%) | 7118966 (1.23%) |
|
| 207 |
+
| ro | 194999 (1.11%) | 6206913 (1.08%) | 820 (1.00%) | 14575 (0.74%) | 195819 (1.11%) | 6221488 (1.08%) |
|
| 208 |
+
| fa | 192050 (1.09%) | 5728246 (1.00%) | 696 (0.85%) | 14765 (0.75%) | 192746 (1.09%) | 5743011 (1.00%) |
|
| 209 |
+
| sk | 189330 (1.08%) | 5318617 (0.93%) | 873 (1.06%) | 20779 (1.06%) | 190203 (1.08%) | 5339396 (0.93%) |
|
| 210 |
+
| la | 188201 (1.07%) | 4591159 (0.80%) | 824 (1.00%) | 8557 (0.44%) | 189025 (1.07%) | 4599716 (0.80%) |
|
| 211 |
+
| bg | 187685 (1.07%) | 5860353 (1.02%) | 762 (0.93%) | 16804 (0.86%) | 188447 (1.07%) | 5877157 (1.02%) |
|
| 212 |
+
| be | 181543 (1.03%) | 6528657 (1.14%) | 869 (1.06%) | 25944 (1.32%) | 182412 (1.03%) | 6554601 (1.14%) |
|
| 213 |
+
| is | 180452 (1.03%) | 6146455 (1.07%) | 959 (1.17%) | 39591 (2.02%) | 181411 (1.03%) | 6186046 (1.07%) |
|
| 214 |
+
| lv | 179142 (1.02%) | 5867897 (1.02%) | 762 (0.93%) | 33481 (1.71%) | 179904 (1.02%) | 5901378 (1.02%) |
|
| 215 |
+
| ckb | 174282 (0.99%) | 7825141 (1.36%) | 667 (0.81%) | 28756 (1.47%) | 174949 (0.99%) | 7853897 (1.36%) |
|
| 216 |
+
| ms | 172573 (0.98%) | 4614764 (0.80%) | 815 (0.99%) | 24769 (1.26%) | 173388 (0.98%) | 4639533 (0.80%) |
|
| 217 |
+
| ka | 170876 (0.97%) | 5505127 (0.96%) | 673 (0.82%) | 20651 (1.05%) | 171549 (0.97%) | 5525778 (0.96%) |
|
| 218 |
+
| kk | 170695 (0.97%) | 5132560 (0.89%) | 676 (0.82%) | 18695 (0.95%) | 171371 (0.97%) | 5151255 (0.89%) |
|
| 219 |
+
| bn | 170721 (0.97%) | 6393448 (1.11%) | 441 (0.54%) | 14727 (0.75%) | 171162 (0.97%) | 6408175 (1.11%) |
|
| 220 |
+
| eu | 168462 (0.96%) | 5737310 (1.00%) | 746 (0.91%) | 37196 (1.90%) | 169208 (0.96%) | 5774506 (1.00%) |
|
| 221 |
+
| as | 168746 (0.96%) | 8564682 (1.49%) | 445 (0.54%) | 24444 (1.25%) | 169191 (0.96%) | 8589126 (1.49%) |
|
| 222 |
+
| mn | 167543 (0.95%) | 5678049 (0.99%) | 703 (0.85%) | 20347 (1.04%) | 168246 (0.95%) | 5698396 (0.99%) |
|
| 223 |
+
| ur | 165992 (0.94%) | 5361179 (0.93%) | 684 (0.83%) | 22622 (1.16%) | 166676 (0.94%) | 5383801 (0.93%) |
|
| 224 |
+
| oc | 165863 (0.94%) | 5735536 (1.00%) | 730 (0.89%) | 18599 (0.95%) | 166593 (0.94%) | 5754135 (1.00%) |
|
| 225 |
+
| ba | 164919 (0.94%) | 8387828 (1.46%) | 699 (0.85%) | 35927 (1.83%) | 165618 (0.94%) | 8423755 (1.46%) |
|
| 226 |
+
| th | 164429 (0.93%) | 5495248 (0.96%) | 649 (0.79%) | 22113 (1.13%) | 165078 (0.93%) | 5517361 (0.96%) |
|
| 227 |
+
| ky | 164374 (0.93%) | 5199548 (0.90%) | 683 (0.83%) | 18956 (0.97%) | 165057 (0.93%) | 5218504 (0.90%) |
|
| 228 |
+
| hr | 163828 (0.93%) | 5183677 (0.90%) | 711 (0.86%) | 33845 (1.73%) | 164539 (0.93%) | 5217522 (0.90%) |
|
| 229 |
+
| ps | 163238 (0.93%) | 4735113 (0.82%) | 674 (0.82%) | 18515 (0.95%) | 163912 (0.93%) | 4753628 (0.82%) |
|
| 230 |
+
| id | 163187 (0.93%) | 4025079 (0.70%) | 723 (0.88%) | 13371 (0.68%) | 163910 (0.93%) | 4038450 (0.70%) |
|
| 231 |
+
| pa | 162180 (0.92%) | 7621059 (1.33%) | 581 (0.71%) | 29036 (1.48%) | 162761 (0.92%) | 7650095 (1.33%) |
|
| 232 |
+
| sw | 161777 (0.92%) | 5013161 (0.87%) | 653 (0.79%) | 26493 (1.35%) | 162430 (0.92%) | 5039654 (0.87%) |
|
| 233 |
+
| af | 160455 (0.91%) | 4676798 (0.81%) | 932 (1.13%) | 27369 (1.40%) | 161387 (0.91%) | 4704167 (0.82%) |
|
| 234 |
+
| jv | 156292 (0.89%) | 4752381 (0.83%) | 576 (0.70%) | 22573 (1.15%) | 156868 (0.89%) | 4774954 (0.83%) |
|
| 235 |
+
| tt | 154833 (0.88%) | 5165763 (0.90%) | 578 (0.70%) | 7298 (0.37%) | 155411 (0.88%) | 5173061 (0.90%) |
|
| 236 |
+
| cy | 153551 (0.87%) | 5656404 (0.98%) | 653 (0.79%) | 29503 (1.51%) | 154204 (0.87%) | 5685907 (0.99%) |
|
| 237 |
+
| ga | 150458 (0.86%) | 5488243 (0.95%) | 680 (0.83%) | 33471 (1.71%) | 151138 (0.86%) | 5521714 (0.96%) |
|
| 238 |
+
| kn | 150184 (0.85%) | 14992479 (2.61%) | 697 (0.85%) | 49288 (2.52%) | 150881 (0.85%) | 15041767 (2.61%) |
|
| 239 |
+
| bs | 150037 (0.85%) | 4582900 (0.80%) | 649 (0.79%) | 25588 (1.31%) | 150686 (0.85%) | 4608488 (0.80%) |
|
| 240 |
+
| ca | 149401 (0.85%) | 5477662 (0.95%) | 629 (0.76%) | 21391 (1.09%) | 150030 (0.85%) | 5499053 (0.95%) |
|
| 241 |
+
| ne | 148716 (0.85%) | 4855198 (0.84%) | 535 (0.65%) | 16246 (0.83%) | 149251 (0.84%) | 4871444 (0.84%) |
|
| 242 |
+
| ku | 147702 (0.84%) | 4973601 (0.87%) | 574 (0.70%) | 28196 (1.44%) | 148276 (0.84%) | 5001797 (0.87%) |
|
| 243 |
+
| gl | 147011 (0.84%) | 4554907 (0.79%) | 658 (0.80%) | 20127 (1.03%) | 147669 (0.84%) | 4575034 (0.79%) |
|
| 244 |
+
| uz | 145433 (0.83%) | 4704898 (0.82%) | 573 (0.70%) | 21862 (1.12%) | 146006 (0.83%) | 4726760 (0.82%) |
|
| 245 |
+
| sl | 144084 (0.82%) | 3851696 (0.67%) | 651 (0.79%) | 18164 (0.93%) | 144735 (0.82%) | 3869860 (0.67%) |
|
| 246 |
+
| sv | 143041 (0.81%) | 4006332 (0.70%) | 905 (1.10%) | 7012 (0.36%) | 143946 (0.81%) | 4013344 (0.70%) |
|
| 247 |
+
| tg | 136703 (0.78%) | 7664329 (1.33%) | 572 (0.70%) | 34220 (1.75%) | 137275 (0.78%) | 7698549 (1.33%) |
|
| 248 |
+
| et | 131007 (0.74%) | 3280590 (0.57%) | 549 (0.67%) | 14021 (0.72%) | 131556 (0.74%) | 3294611 (0.57%) |
|
| 249 |
+
| br | 130223 (0.74%) | 4495403 (0.78%) | 546 (0.66%) | 17304 (0.88%) | 130769 (0.74%) | 4512707 (0.78%) |
|
| 250 |
+
| lb | 129528 (0.74%) | 4421411 (0.77%) | 495 (0.60%) | 17761 (0.91%) | 130023 (0.74%) | 4439172 (0.77%) |
|
| 251 |
+
| su | 129144 (0.73%) | 4215719 (0.73%) | 535 (0.65%) | 21391 (1.09%) | 129679 (0.73%) | 4237110 (0.73%) |
|
| 252 |
+
| mt | 128626 (0.73%) | 6671441 (1.16%) | 508 (0.62%) | 26729 (1.36%) | 129134 (0.73%) | 6698170 (1.16%) |
|
| 253 |
+
| sq | 119431 (0.68%) | 4107917 (0.71%) | 561 (0.68%) | 18633 (0.95%) | 119992 (0.68%) | 4126550 (0.72%) |
|
| 254 |
+
| sr | 117855 (0.67%) | 3160599 (0.55%) | 427 (0.52%) | 3505 (0.18%) | 118282 (0.67%) | 3164104 (0.55%) |
|
| 255 |
+
| or | 110709 (0.63%) | 3922431 (0.68%) | 410 (0.50%) | 13094 (0.67%) | 111119 (0.63%) | 3935525 (0.68%) |
|
| 256 |
+
| ml | 110085 (0.63%) | 10929013 (1.90%) | 464 (0.56%) | 36922 (1.89%) | 110549 (0.63%) | 10965935 (1.90%) |
|
| 257 |
+
| yi | 104494 (0.59%) | 4085563 (0.71%) | 400 (0.49%) | 6005 (0.31%) | 104894 (0.59%) | 4091568 (0.71%) |
|
| 258 |
+
| te | 101076 (0.57%) | 9757033 (1.70%) | 430 (0.52%) | 37897 (1.94%) | 101506 (0.57%) | 9794930 (1.70%) |
|
| 259 |
+
| ta | 94122 (0.53%) | 7917169 (1.38%) | 386 (0.47%) | 26610 (1.36%) | 94508 (0.53%) | 7943779 (1.38%) |
|
| 260 |
+
| mg | 93939 (0.53%) | 3291017 (0.57%) | 391 (0.48%) | 11698 (0.60%) | 94330 (0.53%) | 3302715 (0.57%) |
|
| 261 |
+
| si | 92723 (0.53%) | 5275463 (0.92%) | 364 (0.44%) | 18426 (0.94%) | 93087 (0.53%) | 5293889 (0.92%) |
|
| 262 |
+
| vi | 74916 (0.43%) | 2535825 (0.44%) | 335 (0.41%) | 3396 (0.17%) | 75251 (0.43%) | 2539221 (0.44%) |
|
| 263 |
+
| rm | 74806 (0.43%) | 2826708 (0.49%) | 318 (0.39%) | 12654 (0.65%) | 75124 (0.43%) | 2839362 (0.49%) |
|
| 264 |
+
| gu | 70961 (0.40%) | 7859622 (1.37%) | 335 (0.41%) | 28389 (1.45%) | 71296 (0.40%) | 7888011 (1.37%) |
|
| 265 |
+
| bo | 69565 (0.40%) | 1378245 (0.24%) | 263 (0.32%) | 5407 (0.28%) | 69828 (0.40%) | 1383652 (0.24%) |
|
| 266 |
+
| ug | 64297 (0.37%) | 1427585 (0.25%) | 260 (0.32%) | 4769 (0.24%) | 64557 (0.37%) | 1432354 (0.25%) |
|
| 267 |
+
| dv | 60328 (0.34%) | 1557497 (0.27%) | 215 (0.26%) | 5844 (0.30%) | 60543 (0.34%) | 1563341 (0.27%) |
|
| 268 |
+
| am | 59339 (0.34%) | 2705311 (0.47%) | 235 (0.29%) | 10768 (0.55%) | 59574 (0.34%) | 2716079 (0.47%) |
|
| 269 |
+
| yo | 59246 (0.34%) | 3649130 (0.63%) | 260 (0.32%) | 21157 (1.08%) | 59506 (0.34%) | 3670287 (0.64%) |
|
| 270 |
+
| my | 58575 (0.33%) | 2165089 (0.38%) | 214 (0.26%) | 8142 (0.42%) | 58789 (0.33%) | 2173231 (0.38%) |
|
| 271 |
+
| km | 57081 (0.32%) | 3056236 (0.53%) | 193 (0.23%) | 10606 (0.54%) | 57274 (0.32%) | 3066842 (0.53%) |
|
| 272 |
+
| so | 56160 (0.32%) | 2044409 (0.36%) | 212 (0.26%) | 8847 (0.45%) | 56372 (0.32%) | 2053256 (0.36%) |
|
| 273 |
+
| sd | 55359 (0.31%) | 3226018 (0.56%) | 217 (0.26%) | 10847 (0.55%) | 55576 (0.31%) | 3236865 (0.56%) |
|
| 274 |
+
| zu | 52465 (0.30%) | 2406841 (0.42%) | 203 (0.25%) | 9751 (0.50%) | 52668 (0.30%) | 2416592 (0.42%) |
|
| 275 |
+
| lo | 50641 (0.29%) | 1747495 (0.30%) | 189 (0.23%) | 6221 (0.32%) | 50830 (0.29%) | 1753716 (0.30%) |
|
| 276 |
+
| ti | 47785 (0.27%) | 2895617 (0.50%) | 195 (0.24%) | 12699 (0.65%) | 47980 (0.27%) | 2908316 (0.50%) |
|
| 277 |
+
| ce | 45014 (0.26%) | 2425219 (0.42%) | 188 (0.23%) | 9950 (0.51%) | 45202 (0.26%) | 2435169 (0.42%) |
|
| 278 |
+
| ny | 43552 (0.25%) | 2051132 (0.36%) | 171 (0.21%) | 8286 (0.42%) | 43723 (0.25%) | 2059418 (0.36%) |
|
| 279 |
+
| gd | 36623 (0.21%) | 1273243 (0.22%) | 156 (0.19%) | 3615 (0.18%) | 36779 (0.21%) | 1276858 (0.22%) |
|
| 280 |
+
| xh | 24432 (0.14%) | 911850 (0.16%) | 93 (0.11%) | 3528 (0.18%) | 24525 (0.14%) | 915378 (0.16%) |
|
| 281 |
+
| om | 15372 (0.09%) | 545603 (0.09%) | 77 (0.09%) | 2564 (0.13%) | 15449 (0.09%) | 548167 (0.10%) |
|
| 282 |
+
| sco | 8772 (0.05%) | 233030 (0.04%) | 37 (0.04%) | 828 (0.04%) | 8809 (0.05%) | 233858 (0.04%) |
|
| 283 |
+
| **total** | 17593786 (100.00%) | 574735483 (100.00%) | 82270 (100.00%) | 1958217 (100.00%) | 17676056 (100.00%) | 576693700 (100.00%) |
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
This model is a fine-tuned version of [xlm-roberta-base](https://huggingface.co/xlm-roberta-base) on an unknown dataset.
|
| 287 |
+
It achieves the following results on the evaluation set:
|
| 288 |
+
- Loss: 0.0306
|
| 289 |
+
- Precision: 0.9507
|
| 290 |
+
- Recall: 0.9644
|
| 291 |
+
- F1: 0.9575
|
| 292 |
+
- Accuracy: 0.9917
|
| 293 |
+
|
| 294 |
+
## Training procedure
|
| 295 |
+
|
| 296 |
+
### Training hyperparameters
|
| 297 |
+
|
| 298 |
+
The following hyperparameters were used during training:
|
| 299 |
+
- learning_rate: 5e-05
|
| 300 |
+
- train_batch_size: 72
|
| 301 |
+
- eval_batch_size: 36
|
| 302 |
+
- seed: 42
|
| 303 |
+
- gradient_accumulation_steps: 2
|
| 304 |
+
- total_train_batch_size: 144
|
| 305 |
+
- optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
|
| 306 |
+
- lr_scheduler_type: linear
|
| 307 |
+
- num_epochs: 2
|
| 308 |
+
- mixed_precision_training: Native AMP
|
| 309 |
+
|
| 310 |
+
### Training results
|
| 311 |
+
|
| 312 |
+
| Training Loss | Epoch | Step | Validation Loss | Precision | Recall | F1 | Accuracy |
|
| 313 |
+
|:-------------:|:------:|:-----:|:---------------:|:---------:|:------:|:------:|:--------:|
|
| 314 |
+
| 0.0918 | 0.0731 | 2500 | 0.1050 | 0.7984 | 0.8818 | 0.8381 | 0.9735 |
|
| 315 |
+
| 0.0717 | 0.1463 | 5000 | 0.0797 | 0.8393 | 0.9041 | 0.8705 | 0.9782 |
|
| 316 |
+
| 0.0624 | 0.2194 | 7500 | 0.0762 | 0.8664 | 0.9166 | 0.8908 | 0.9804 |
|
| 317 |
+
| 0.0562 | 0.2925 | 10000 | 0.0620 | 0.8758 | 0.9247 | 0.8995 | 0.9830 |
|
| 318 |
+
| 0.0516 | 0.3657 | 12500 | 0.0576 | 0.8844 | 0.9298 | 0.9065 | 0.9845 |
|
| 319 |
+
| 0.0465 | 0.4388 | 15000 | 0.0543 | 0.8993 | 0.9357 | 0.9172 | 0.9857 |
|
| 320 |
+
| 0.0433 | 0.5119 | 17500 | 0.0558 | 0.9005 | 0.9356 | 0.9177 | 0.9856 |
|
| 321 |
+
| 0.0411 | 0.5851 | 20000 | 0.0499 | 0.9012 | 0.9385 | 0.9195 | 0.9867 |
|
| 322 |
+
| 0.0420 | 0.6582 | 22500 | 0.0460 | 0.9167 | 0.9438 | 0.9300 | 0.9873 |
|
| 323 |
+
| 0.0392 | 0.7313 | 25000 | 0.0441 | 0.9149 | 0.9448 | 0.9296 | 0.9878 |
|
| 324 |
+
| 0.0386 | 0.8045 | 27500 | 0.0434 | 0.9200 | 0.9476 | 0.9336 | 0.9885 |
|
| 325 |
+
| 0.0357 | 0.8776 | 30000 | 0.0422 | 0.9235 | 0.9503 | 0.9367 | 0.9886 |
|
| 326 |
+
| 0.0356 | 0.9507 | 32500 | 0.0404 | 0.9272 | 0.9520 | 0.9395 | 0.9890 |
|
| 327 |
+
| 0.0261 | 1.0238 | 35000 | 0.0381 | 0.9293 | 0.9529 | 0.9409 | 0.9898 |
|
| 328 |
+
| 0.0322 | 1.0970 | 37500 | 0.0371 | 0.9346 | 0.9558 | 0.9451 | 0.9899 |
|
| 329 |
+
| 0.0303 | 1.1701 | 40000 | 0.0374 | 0.9375 | 0.9580 | 0.9476 | 0.9903 |
|
| 330 |
+
| 0.0276 | 1.2432 | 42500 | 0.0378 | 0.9355 | 0.9566 | 0.9460 | 0.9901 |
|
| 331 |
+
| 0.0264 | 1.3164 | 45000 | 0.0353 | 0.9373 | 0.9574 | 0.9472 | 0.9904 |
|
| 332 |
+
| 0.0228 | 1.3895 | 47500 | 0.0366 | 0.9398 | 0.9589 | 0.9493 | 0.9903 |
|
| 333 |
+
| 0.0234 | 1.4626 | 50000 | 0.0343 | 0.9430 | 0.9602 | 0.9516 | 0.9907 |
|
| 334 |
+
| 0.0274 | 1.5358 | 52500 | 0.0339 | 0.9396 | 0.9591 | 0.9492 | 0.9906 |
|
| 335 |
+
| 0.0236 | 1.6089 | 55000 | 0.0324 | 0.9438 | 0.9613 | 0.9525 | 0.9913 |
|
| 336 |
+
| 0.0244 | 1.6820 | 57500 | 0.0322 | 0.9478 | 0.9624 | 0.9551 | 0.9914 |
|
| 337 |
+
| 0.0222 | 1.7552 | 60000 | 0.0323 | 0.9483 | 0.9628 | 0.9555 | 0.9914 |
|
| 338 |
+
| 0.0238 | 1.8283 | 62500 | 0.0320 | 0.9480 | 0.9630 | 0.9554 | 0.9913 |
|
| 339 |
+
| 0.0223 | 1.9014 | 65000 | 0.0320 | 0.9485 | 0.9637 | 0.9560 | 0.9913 |
|
| 340 |
+
| 0.0208 | 1.9746 | 67500 | 0.0306 | 0.9507 | 0.9644 | 0.9575 | 0.9917 |
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
### Framework versions
|
| 344 |
+
|
| 345 |
+
- Transformers 5.0.0
|
| 346 |
+
- Pytorch 2.10.0+cu128
|
| 347 |
+
- Datasets 4.0.0
|
| 348 |
+
- Tokenizers 0.22.2
|
config.json
ADDED
|
@@ -0,0 +1,456 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_cross_attention": false,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"XLMRobertaForTokenClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": 0,
|
| 8 |
+
"classifier_dropout": null,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"eos_token_id": 2,
|
| 11 |
+
"hidden_act": "gelu",
|
| 12 |
+
"hidden_dropout_prob": 0.1,
|
| 13 |
+
"hidden_size": 768,
|
| 14 |
+
"id2label": {
|
| 15 |
+
"0": "O",
|
| 16 |
+
"1": "B-EN",
|
| 17 |
+
"2": "I-EN",
|
| 18 |
+
"3": "B-ES",
|
| 19 |
+
"4": "I-ES",
|
| 20 |
+
"5": "B-FR",
|
| 21 |
+
"6": "I-FR",
|
| 22 |
+
"7": "B-DE",
|
| 23 |
+
"8": "I-DE",
|
| 24 |
+
"9": "B-IT",
|
| 25 |
+
"10": "I-IT",
|
| 26 |
+
"11": "B-PT",
|
| 27 |
+
"12": "I-PT",
|
| 28 |
+
"13": "B-NL",
|
| 29 |
+
"14": "I-NL",
|
| 30 |
+
"15": "B-VI",
|
| 31 |
+
"16": "I-VI",
|
| 32 |
+
"17": "B-TR",
|
| 33 |
+
"18": "I-TR",
|
| 34 |
+
"19": "B-LA",
|
| 35 |
+
"20": "I-LA",
|
| 36 |
+
"21": "B-ID",
|
| 37 |
+
"22": "I-ID",
|
| 38 |
+
"23": "B-MS",
|
| 39 |
+
"24": "I-MS",
|
| 40 |
+
"25": "B-AF",
|
| 41 |
+
"26": "I-AF",
|
| 42 |
+
"27": "B-SQ",
|
| 43 |
+
"28": "I-SQ",
|
| 44 |
+
"29": "B-IS",
|
| 45 |
+
"30": "I-IS",
|
| 46 |
+
"31": "B-NO",
|
| 47 |
+
"32": "I-NO",
|
| 48 |
+
"33": "B-SV",
|
| 49 |
+
"34": "I-SV",
|
| 50 |
+
"35": "B-DA",
|
| 51 |
+
"36": "I-DA",
|
| 52 |
+
"37": "B-FI",
|
| 53 |
+
"38": "I-FI",
|
| 54 |
+
"39": "B-HU",
|
| 55 |
+
"40": "I-HU",
|
| 56 |
+
"41": "B-PL",
|
| 57 |
+
"42": "I-PL",
|
| 58 |
+
"43": "B-CS",
|
| 59 |
+
"44": "I-CS",
|
| 60 |
+
"45": "B-RO",
|
| 61 |
+
"46": "I-RO",
|
| 62 |
+
"47": "B-RU",
|
| 63 |
+
"48": "I-RU",
|
| 64 |
+
"49": "B-BG",
|
| 65 |
+
"50": "I-BG",
|
| 66 |
+
"51": "B-UK",
|
| 67 |
+
"52": "I-UK",
|
| 68 |
+
"53": "B-SR",
|
| 69 |
+
"54": "I-SR",
|
| 70 |
+
"55": "B-BE",
|
| 71 |
+
"56": "I-BE",
|
| 72 |
+
"57": "B-KK",
|
| 73 |
+
"58": "I-KK",
|
| 74 |
+
"59": "B-MK",
|
| 75 |
+
"60": "I-MK",
|
| 76 |
+
"61": "B-MN",
|
| 77 |
+
"62": "I-MN",
|
| 78 |
+
"63": "B-ZH",
|
| 79 |
+
"64": "I-ZH",
|
| 80 |
+
"65": "B-JA",
|
| 81 |
+
"66": "I-JA",
|
| 82 |
+
"67": "B-KO",
|
| 83 |
+
"68": "I-KO",
|
| 84 |
+
"69": "B-HI",
|
| 85 |
+
"70": "I-HI",
|
| 86 |
+
"71": "B-UR",
|
| 87 |
+
"72": "I-UR",
|
| 88 |
+
"73": "B-BN",
|
| 89 |
+
"74": "I-BN",
|
| 90 |
+
"75": "B-TA",
|
| 91 |
+
"76": "I-TA",
|
| 92 |
+
"77": "B-TE",
|
| 93 |
+
"78": "I-TE",
|
| 94 |
+
"79": "B-MR",
|
| 95 |
+
"80": "I-MR",
|
| 96 |
+
"81": "B-GU",
|
| 97 |
+
"82": "I-GU",
|
| 98 |
+
"83": "B-KN",
|
| 99 |
+
"84": "I-KN",
|
| 100 |
+
"85": "B-ML",
|
| 101 |
+
"86": "I-ML",
|
| 102 |
+
"87": "B-PA",
|
| 103 |
+
"88": "I-PA",
|
| 104 |
+
"89": "B-AS",
|
| 105 |
+
"90": "I-AS",
|
| 106 |
+
"91": "B-OR",
|
| 107 |
+
"92": "I-OR",
|
| 108 |
+
"93": "B-AR",
|
| 109 |
+
"94": "I-AR",
|
| 110 |
+
"95": "B-FA",
|
| 111 |
+
"96": "I-FA",
|
| 112 |
+
"97": "B-PS",
|
| 113 |
+
"98": "I-PS",
|
| 114 |
+
"99": "B-SD",
|
| 115 |
+
"100": "I-SD",
|
| 116 |
+
"101": "B-UG",
|
| 117 |
+
"102": "I-UG",
|
| 118 |
+
"103": "B-EL",
|
| 119 |
+
"104": "I-EL",
|
| 120 |
+
"105": "B-HE",
|
| 121 |
+
"106": "I-HE",
|
| 122 |
+
"107": "B-YI",
|
| 123 |
+
"108": "I-YI",
|
| 124 |
+
"109": "B-HY",
|
| 125 |
+
"110": "I-HY",
|
| 126 |
+
"111": "B-KA",
|
| 127 |
+
"112": "I-KA",
|
| 128 |
+
"113": "B-AM",
|
| 129 |
+
"114": "I-AM",
|
| 130 |
+
"115": "B-KM",
|
| 131 |
+
"116": "I-KM",
|
| 132 |
+
"117": "B-LO",
|
| 133 |
+
"118": "I-LO",
|
| 134 |
+
"119": "B-MY",
|
| 135 |
+
"120": "I-MY",
|
| 136 |
+
"121": "B-TH",
|
| 137 |
+
"122": "I-TH",
|
| 138 |
+
"123": "B-SI",
|
| 139 |
+
"124": "I-SI",
|
| 140 |
+
"125": "B-BO",
|
| 141 |
+
"126": "I-BO",
|
| 142 |
+
"127": "B-DV",
|
| 143 |
+
"128": "I-DV",
|
| 144 |
+
"129": "B-TI",
|
| 145 |
+
"130": "I-TI",
|
| 146 |
+
"131": "B-SW",
|
| 147 |
+
"132": "I-SW",
|
| 148 |
+
"133": "B-EU",
|
| 149 |
+
"134": "I-EU",
|
| 150 |
+
"135": "B-TL",
|
| 151 |
+
"136": "I-TL",
|
| 152 |
+
"137": "B-XH",
|
| 153 |
+
"138": "I-XH",
|
| 154 |
+
"139": "B-CA",
|
| 155 |
+
"140": "I-CA",
|
| 156 |
+
"141": "B-GL",
|
| 157 |
+
"142": "I-GL",
|
| 158 |
+
"143": "B-OC",
|
| 159 |
+
"144": "I-OC",
|
| 160 |
+
"145": "B-BR",
|
| 161 |
+
"146": "I-BR",
|
| 162 |
+
"147": "B-GA",
|
| 163 |
+
"148": "I-GA",
|
| 164 |
+
"149": "B-GD",
|
| 165 |
+
"150": "I-GD",
|
| 166 |
+
"151": "B-CY",
|
| 167 |
+
"152": "I-CY",
|
| 168 |
+
"153": "B-SCO",
|
| 169 |
+
"154": "I-SCO",
|
| 170 |
+
"155": "B-BS",
|
| 171 |
+
"156": "I-BS",
|
| 172 |
+
"157": "B-HR",
|
| 173 |
+
"158": "I-HR",
|
| 174 |
+
"159": "B-SL",
|
| 175 |
+
"160": "I-SL",
|
| 176 |
+
"161": "B-SK",
|
| 177 |
+
"162": "I-SK",
|
| 178 |
+
"163": "B-ET",
|
| 179 |
+
"164": "I-ET",
|
| 180 |
+
"165": "B-LV",
|
| 181 |
+
"166": "I-LV",
|
| 182 |
+
"167": "B-LT",
|
| 183 |
+
"168": "I-LT",
|
| 184 |
+
"169": "B-EO",
|
| 185 |
+
"170": "I-EO",
|
| 186 |
+
"171": "B-JV",
|
| 187 |
+
"172": "I-JV",
|
| 188 |
+
"173": "B-MG",
|
| 189 |
+
"174": "I-MG",
|
| 190 |
+
"175": "B-OM",
|
| 191 |
+
"176": "I-OM",
|
| 192 |
+
"177": "B-SO",
|
| 193 |
+
"178": "I-SO",
|
| 194 |
+
"179": "B-SU",
|
| 195 |
+
"180": "I-SU",
|
| 196 |
+
"181": "B-UZ",
|
| 197 |
+
"182": "I-UZ",
|
| 198 |
+
"183": "B-KU",
|
| 199 |
+
"184": "I-KU",
|
| 200 |
+
"185": "B-CKB",
|
| 201 |
+
"186": "I-CKB",
|
| 202 |
+
"187": "B-NE",
|
| 203 |
+
"188": "I-NE",
|
| 204 |
+
"189": "B-MT",
|
| 205 |
+
"190": "I-MT",
|
| 206 |
+
"191": "B-LB",
|
| 207 |
+
"192": "I-LB",
|
| 208 |
+
"193": "B-RM",
|
| 209 |
+
"194": "I-RM",
|
| 210 |
+
"195": "B-TT",
|
| 211 |
+
"196": "I-TT",
|
| 212 |
+
"197": "B-KY",
|
| 213 |
+
"198": "I-KY",
|
| 214 |
+
"199": "B-TG",
|
| 215 |
+
"200": "I-TG",
|
| 216 |
+
"201": "B-BA",
|
| 217 |
+
"202": "I-BA",
|
| 218 |
+
"203": "B-YO",
|
| 219 |
+
"204": "I-YO",
|
| 220 |
+
"205": "B-ZU",
|
| 221 |
+
"206": "I-ZU",
|
| 222 |
+
"207": "B-NY",
|
| 223 |
+
"208": "I-NY",
|
| 224 |
+
"209": "B-CE",
|
| 225 |
+
"210": "I-CE"
|
| 226 |
+
},
|
| 227 |
+
"initializer_range": 0.02,
|
| 228 |
+
"intermediate_size": 3072,
|
| 229 |
+
"is_decoder": false,
|
| 230 |
+
"label2id": {
|
| 231 |
+
"B-AF": 25,
|
| 232 |
+
"B-AM": 113,
|
| 233 |
+
"B-AR": 93,
|
| 234 |
+
"B-AS": 89,
|
| 235 |
+
"B-BA": 201,
|
| 236 |
+
"B-BE": 55,
|
| 237 |
+
"B-BG": 49,
|
| 238 |
+
"B-BN": 73,
|
| 239 |
+
"B-BO": 125,
|
| 240 |
+
"B-BR": 145,
|
| 241 |
+
"B-BS": 155,
|
| 242 |
+
"B-CA": 139,
|
| 243 |
+
"B-CE": 209,
|
| 244 |
+
"B-CKB": 185,
|
| 245 |
+
"B-CS": 43,
|
| 246 |
+
"B-CY": 151,
|
| 247 |
+
"B-DA": 35,
|
| 248 |
+
"B-DE": 7,
|
| 249 |
+
"B-DV": 127,
|
| 250 |
+
"B-EL": 103,
|
| 251 |
+
"B-EN": 1,
|
| 252 |
+
"B-EO": 169,
|
| 253 |
+
"B-ES": 3,
|
| 254 |
+
"B-ET": 163,
|
| 255 |
+
"B-EU": 133,
|
| 256 |
+
"B-FA": 95,
|
| 257 |
+
"B-FI": 37,
|
| 258 |
+
"B-FR": 5,
|
| 259 |
+
"B-GA": 147,
|
| 260 |
+
"B-GD": 149,
|
| 261 |
+
"B-GL": 141,
|
| 262 |
+
"B-GU": 81,
|
| 263 |
+
"B-HE": 105,
|
| 264 |
+
"B-HI": 69,
|
| 265 |
+
"B-HR": 157,
|
| 266 |
+
"B-HU": 39,
|
| 267 |
+
"B-HY": 109,
|
| 268 |
+
"B-ID": 21,
|
| 269 |
+
"B-IS": 29,
|
| 270 |
+
"B-IT": 9,
|
| 271 |
+
"B-JA": 65,
|
| 272 |
+
"B-JV": 171,
|
| 273 |
+
"B-KA": 111,
|
| 274 |
+
"B-KK": 57,
|
| 275 |
+
"B-KM": 115,
|
| 276 |
+
"B-KN": 83,
|
| 277 |
+
"B-KO": 67,
|
| 278 |
+
"B-KU": 183,
|
| 279 |
+
"B-KY": 197,
|
| 280 |
+
"B-LA": 19,
|
| 281 |
+
"B-LB": 191,
|
| 282 |
+
"B-LO": 117,
|
| 283 |
+
"B-LT": 167,
|
| 284 |
+
"B-LV": 165,
|
| 285 |
+
"B-MG": 173,
|
| 286 |
+
"B-MK": 59,
|
| 287 |
+
"B-ML": 85,
|
| 288 |
+
"B-MN": 61,
|
| 289 |
+
"B-MR": 79,
|
| 290 |
+
"B-MS": 23,
|
| 291 |
+
"B-MT": 189,
|
| 292 |
+
"B-MY": 119,
|
| 293 |
+
"B-NE": 187,
|
| 294 |
+
"B-NL": 13,
|
| 295 |
+
"B-NO": 31,
|
| 296 |
+
"B-NY": 207,
|
| 297 |
+
"B-OC": 143,
|
| 298 |
+
"B-OM": 175,
|
| 299 |
+
"B-OR": 91,
|
| 300 |
+
"B-PA": 87,
|
| 301 |
+
"B-PL": 41,
|
| 302 |
+
"B-PS": 97,
|
| 303 |
+
"B-PT": 11,
|
| 304 |
+
"B-RM": 193,
|
| 305 |
+
"B-RO": 45,
|
| 306 |
+
"B-RU": 47,
|
| 307 |
+
"B-SCO": 153,
|
| 308 |
+
"B-SD": 99,
|
| 309 |
+
"B-SI": 123,
|
| 310 |
+
"B-SK": 161,
|
| 311 |
+
"B-SL": 159,
|
| 312 |
+
"B-SO": 177,
|
| 313 |
+
"B-SQ": 27,
|
| 314 |
+
"B-SR": 53,
|
| 315 |
+
"B-SU": 179,
|
| 316 |
+
"B-SV": 33,
|
| 317 |
+
"B-SW": 131,
|
| 318 |
+
"B-TA": 75,
|
| 319 |
+
"B-TE": 77,
|
| 320 |
+
"B-TG": 199,
|
| 321 |
+
"B-TH": 121,
|
| 322 |
+
"B-TI": 129,
|
| 323 |
+
"B-TL": 135,
|
| 324 |
+
"B-TR": 17,
|
| 325 |
+
"B-TT": 195,
|
| 326 |
+
"B-UG": 101,
|
| 327 |
+
"B-UK": 51,
|
| 328 |
+
"B-UR": 71,
|
| 329 |
+
"B-UZ": 181,
|
| 330 |
+
"B-VI": 15,
|
| 331 |
+
"B-XH": 137,
|
| 332 |
+
"B-YI": 107,
|
| 333 |
+
"B-YO": 203,
|
| 334 |
+
"B-ZH": 63,
|
| 335 |
+
"B-ZU": 205,
|
| 336 |
+
"I-AF": 26,
|
| 337 |
+
"I-AM": 114,
|
| 338 |
+
"I-AR": 94,
|
| 339 |
+
"I-AS": 90,
|
| 340 |
+
"I-BA": 202,
|
| 341 |
+
"I-BE": 56,
|
| 342 |
+
"I-BG": 50,
|
| 343 |
+
"I-BN": 74,
|
| 344 |
+
"I-BO": 126,
|
| 345 |
+
"I-BR": 146,
|
| 346 |
+
"I-BS": 156,
|
| 347 |
+
"I-CA": 140,
|
| 348 |
+
"I-CE": 210,
|
| 349 |
+
"I-CKB": 186,
|
| 350 |
+
"I-CS": 44,
|
| 351 |
+
"I-CY": 152,
|
| 352 |
+
"I-DA": 36,
|
| 353 |
+
"I-DE": 8,
|
| 354 |
+
"I-DV": 128,
|
| 355 |
+
"I-EL": 104,
|
| 356 |
+
"I-EN": 2,
|
| 357 |
+
"I-EO": 170,
|
| 358 |
+
"I-ES": 4,
|
| 359 |
+
"I-ET": 164,
|
| 360 |
+
"I-EU": 134,
|
| 361 |
+
"I-FA": 96,
|
| 362 |
+
"I-FI": 38,
|
| 363 |
+
"I-FR": 6,
|
| 364 |
+
"I-GA": 148,
|
| 365 |
+
"I-GD": 150,
|
| 366 |
+
"I-GL": 142,
|
| 367 |
+
"I-GU": 82,
|
| 368 |
+
"I-HE": 106,
|
| 369 |
+
"I-HI": 70,
|
| 370 |
+
"I-HR": 158,
|
| 371 |
+
"I-HU": 40,
|
| 372 |
+
"I-HY": 110,
|
| 373 |
+
"I-ID": 22,
|
| 374 |
+
"I-IS": 30,
|
| 375 |
+
"I-IT": 10,
|
| 376 |
+
"I-JA": 66,
|
| 377 |
+
"I-JV": 172,
|
| 378 |
+
"I-KA": 112,
|
| 379 |
+
"I-KK": 58,
|
| 380 |
+
"I-KM": 116,
|
| 381 |
+
"I-KN": 84,
|
| 382 |
+
"I-KO": 68,
|
| 383 |
+
"I-KU": 184,
|
| 384 |
+
"I-KY": 198,
|
| 385 |
+
"I-LA": 20,
|
| 386 |
+
"I-LB": 192,
|
| 387 |
+
"I-LO": 118,
|
| 388 |
+
"I-LT": 168,
|
| 389 |
+
"I-LV": 166,
|
| 390 |
+
"I-MG": 174,
|
| 391 |
+
"I-MK": 60,
|
| 392 |
+
"I-ML": 86,
|
| 393 |
+
"I-MN": 62,
|
| 394 |
+
"I-MR": 80,
|
| 395 |
+
"I-MS": 24,
|
| 396 |
+
"I-MT": 190,
|
| 397 |
+
"I-MY": 120,
|
| 398 |
+
"I-NE": 188,
|
| 399 |
+
"I-NL": 14,
|
| 400 |
+
"I-NO": 32,
|
| 401 |
+
"I-NY": 208,
|
| 402 |
+
"I-OC": 144,
|
| 403 |
+
"I-OM": 176,
|
| 404 |
+
"I-OR": 92,
|
| 405 |
+
"I-PA": 88,
|
| 406 |
+
"I-PL": 42,
|
| 407 |
+
"I-PS": 98,
|
| 408 |
+
"I-PT": 12,
|
| 409 |
+
"I-RM": 194,
|
| 410 |
+
"I-RO": 46,
|
| 411 |
+
"I-RU": 48,
|
| 412 |
+
"I-SCO": 154,
|
| 413 |
+
"I-SD": 100,
|
| 414 |
+
"I-SI": 124,
|
| 415 |
+
"I-SK": 162,
|
| 416 |
+
"I-SL": 160,
|
| 417 |
+
"I-SO": 178,
|
| 418 |
+
"I-SQ": 28,
|
| 419 |
+
"I-SR": 54,
|
| 420 |
+
"I-SU": 180,
|
| 421 |
+
"I-SV": 34,
|
| 422 |
+
"I-SW": 132,
|
| 423 |
+
"I-TA": 76,
|
| 424 |
+
"I-TE": 78,
|
| 425 |
+
"I-TG": 200,
|
| 426 |
+
"I-TH": 122,
|
| 427 |
+
"I-TI": 130,
|
| 428 |
+
"I-TL": 136,
|
| 429 |
+
"I-TR": 18,
|
| 430 |
+
"I-TT": 196,
|
| 431 |
+
"I-UG": 102,
|
| 432 |
+
"I-UK": 52,
|
| 433 |
+
"I-UR": 72,
|
| 434 |
+
"I-UZ": 182,
|
| 435 |
+
"I-VI": 16,
|
| 436 |
+
"I-XH": 138,
|
| 437 |
+
"I-YI": 108,
|
| 438 |
+
"I-YO": 204,
|
| 439 |
+
"I-ZH": 64,
|
| 440 |
+
"I-ZU": 206,
|
| 441 |
+
"O": 0
|
| 442 |
+
},
|
| 443 |
+
"layer_norm_eps": 1e-05,
|
| 444 |
+
"max_position_embeddings": 514,
|
| 445 |
+
"model_type": "xlm-roberta",
|
| 446 |
+
"num_attention_heads": 12,
|
| 447 |
+
"num_hidden_layers": 12,
|
| 448 |
+
"output_past": true,
|
| 449 |
+
"pad_token_id": 1,
|
| 450 |
+
"position_embedding_type": "absolute",
|
| 451 |
+
"tie_word_embeddings": true,
|
| 452 |
+
"transformers_version": "5.0.0",
|
| 453 |
+
"type_vocab_size": 1,
|
| 454 |
+
"use_cache": false,
|
| 455 |
+
"vocab_size": 250002
|
| 456 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2455b960fbdfa72ad7b04a55ea6dc99bddb425c54bb3f4023295d9a7470a4d44
|
| 3 |
+
size 1110485292
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7a5451f31fe3f899dcd75ec2ad93f415528c9b5f58bb7a5a1c6dd5884fb56257
|
| 3 |
+
size 16781486
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": true,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": "<s>",
|
| 5 |
+
"cls_token": "<s>",
|
| 6 |
+
"eos_token": "</s>",
|
| 7 |
+
"is_local": false,
|
| 8 |
+
"mask_token": "<mask>",
|
| 9 |
+
"model_max_length": 512,
|
| 10 |
+
"pad_token": "<pad>",
|
| 11 |
+
"sep_token": "</s>",
|
| 12 |
+
"tokenizer_class": "XLMRobertaTokenizer",
|
| 13 |
+
"unk_token": "<unk>"
|
| 14 |
+
}
|
trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5d5d81020e6bbd67f3b2229ebb1af43c3661c83bb3947ad30ea05b011fc7aa50
|
| 3 |
+
size 5201
|