DerivedFunction committed on
Commit 7b63c61 · verified · 1 Parent(s): bfc0d7d

Upload 8 files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,348 @@
1
+ ---
2
+ library_name: transformers
3
+ license: mit
4
+ base_model: xlm-roberta-base
5
+ tags:
6
+ - generated_from_trainer
7
+ - language-identification
8
+ - codeswitching
9
+ metrics:
10
+ - precision
11
+ - recall
12
+ - f1
13
+ - accuracy
14
+ language:
15
+ - multilingual
16
+ - af
17
+ - am
18
+ - ar
19
+ - as
20
+ - ba
21
+ - be
22
+ - bg
23
+ - bn
24
+ - bo
25
+ - br
26
+ - bs
27
+ - ca
28
+ - ce
29
+ - ckb
30
+ - cs
31
+ - cy
32
+ - da
33
+ - de
34
+ - dv
35
+ - el
36
+ - en
37
+ - eo
38
+ - es
39
+ - et
40
+ - eu
41
+ - fa
42
+ - fi
43
+ - fr
44
+ - ga
45
+ - gd
46
+ - gl
47
+ - gu
48
+ - he
49
+ - hi
50
+ - hr
51
+ - hu
52
+ - hy
53
+ - id
54
+ - is
55
+ - it
56
+ - ja
57
+ - jv
58
+ - ka
59
+ - kk
60
+ - km
61
+ - kn
62
+ - ko
63
+ - ku
64
+ - ky
65
+ - la
66
+ - lb
67
+ - lo
68
+ - lt
69
+ - lv
70
+ - mg
71
+ - mk
72
+ - ml
73
+ - mn
74
+ - mr
75
+ - ms
76
+ - mt
77
+ - my
78
+ - ne
79
+ - nl
80
+ - 'no'
81
+ - ny
82
+ - oc
83
+ - om
84
+ - or
85
+ - pa
86
+ - pl
87
+ - ps
88
+ - pt
89
+ - rm
90
+ - ro
91
+ - ru
92
+ - sd
93
+ - si
94
+ - sk
95
+ - sl
96
+ - so
97
+ - sq
98
+ - sr
99
+ - su
100
+ - sv
101
+ - sw
102
+ - ta
103
+ - te
104
+ - tg
105
+ - th
106
+ - ti
107
+ - tl
108
+ - tr
109
+ - tt
110
+ - ug
111
+ - uk
112
+ - ur
113
+ - uz
114
+ - vi
115
+ - yo
116
+ - yi
117
+ - zh
118
+ - zu
119
+ model-index:
120
+ - name: polyglot-tagger
121
+ results: []
122
+ datasets:
123
+ - wikimedia/wikipedia
124
+ - HuggingFaceFW/finetranslations
125
+ - google/smol
126
+ - DerivedFunction/nlp-noise-snippets
127
+ - DerivedFunction/wikipedia-language-snippets-filtered
128
+ - DerivedFunction/finetranslations-filtered
129
+ - DerivedFunction/tatoeba-filtered
130
+ pipeline_tag: token-classification
131
+ ---
132
+
133
+
134
+ ![image](https://cdn-uploads.huggingface.co/production/uploads/67ee3f0a66388136438834cc/OnfV_fN2br5c4cPnOn6O0.png)
135
+
136
+
137
+ Fine-tuned `xlm-roberta-base` for sentence-level language tagging across 100 languages.
138
+ The model predicts BIO-style language tags over tokens, which makes it useful for
139
+ language identification, code-switch detection, and multilingual document analysis.
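The BIO tag scheme means consecutive `B-XX`/`I-XX` predictions can be folded into contiguous language spans. A minimal decoding sketch (illustrative only; this is not post-processing shipped with the model):

```python
def bio_to_spans(tags):
    """Merge token-level BIO language tags (e.g. B-EN, I-EN, O) into
    (language, start, end) spans over token indices; end is exclusive."""
    spans = []
    for i, tag in enumerate(tags):
        if tag == "O":
            continue
        prefix, lang = tag.split("-", 1)
        # Start a new span on B-, on a language change, or after a gap.
        if prefix == "B" or not spans or spans[-1][0] != lang or spans[-1][2] != i:
            spans.append([lang, i, i + 1])
        else:
            spans[-1][2] = i + 1
    return [tuple(s) for s in spans]
```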
140
+
141
+ > Compared to version 2.2, this version adds training data aimed at fixing cases where the model scored common grade-school words from major languages with low confidence or placed them in a minor-language bucket.
142
+
143
+ ## Model description
144
+
145
+ Introducing Polyglot Tagger, a new way to classify multilingual documents. By training specifically on token classification over individual sentences, the model
146
+ generalizes well across a variety of languages, while also behaving as a multi-label classifier that can extract sentences by language.
147
+
148
+ ## Intended uses & limitations
149
+ This model can be treated as a base model for further fine-tuning on specific language identification and extraction tasks.
150
+ Note that, as a general language-tagging model, it can be confused by closely related languages or by short texts: for example, Danish and Norwegian, Spanish and Portuguese, or Russian and Ukrainian.
151
+
152
+ The model is trained on sentences with a minimum of four tokens, so it may not accurately classify very short and ambiguous statements. Note that this model is experimental
153
+ and may produce unexpected results compared to generic text classifiers. Because it is trained on cleaned text, "messy" text may produce unexpected results.
154
+
155
+ > Note that Romanized versions of languages, such as Romanized Russian or Hindi, may have no representation in the training set.
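In practice, these intended uses map naturally onto the Hugging Face token-classification pipeline, e.g. `pipeline("token-classification", model="DerivedFunction/polyglot-tagger", aggregation_strategy="simple")` (the repo id is assumed from the model-index name). The sketch below groups pipeline-style output into per-language text; the `results` list is a hypothetical example output, not a recorded run:

```python
from collections import defaultdict

# Hypothetical aggregated pipeline output: shapes follow transformers'
# token-classification pipeline with aggregation_strategy="simple",
# but the values are illustrative, not a recorded model run.
results = [
    {"entity_group": "EN", "word": "Hello world.", "score": 0.99},
    {"entity_group": "ES", "word": "Hola mundo.", "score": 0.98},
]

def group_by_language(entities, min_score=0.5):
    """Bucket aggregated predictions by language, dropping low-confidence ones."""
    buckets = defaultdict(list)
    for ent in entities:
        if ent["score"] >= min_score:
            buckets[ent["entity_group"]].append(ent["word"])
    return dict(buckets)
```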
156
+
157
+ ### Training and Evaluation Data
158
+
159
+ A synthetic training row consists of 1-6 individual and mostly independent sentences extracted from various sources. To generalize well across multiple languages, several
159
+ augmentations were used to simulate messy text and to reduce single-character bias in certain languages:
161
+ - Low chance of deliberate accent stripping for languages such as Spanish and Portuguese
162
+ - Random chance to add, replace, or delete punctuation, numeric, and delimiter artifacts
163
+ - Insertion of same-script characters from related languages; for example, randomly injecting Arabic characters into Arabic-script languages
164
+ - Random chance to change the casing of compatible language scripts, such as Latin and Cyrillic
165
+ - Low chance of simulating OCR and messy text with character mutation.
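As one concrete example, the low-probability accent stripping above might look like this (a minimal sketch; function names are assumptions, not the actual training code):

```python
import random
import unicodedata

def strip_accents(text: str) -> str:
    """Drop combining accent marks, e.g. 'Portugués' -> 'Portugues'."""
    decomposed = unicodedata.normalize("NFD", text)
    stripped = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return unicodedata.normalize("NFC", stripped)

def maybe_strip_accents(sentence: str, rng: random.Random, p: float = 0.05) -> str:
    """Apply accent stripping with a low probability, per the augmentation recipe."""
    return strip_accents(sentence) if rng.random() < p else sentence
```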
166
+
167
+ To generalize well on both the target language and code switching, a curriculum is used:
168
+ - Pure documents (55%): a single language, to learn its vocabulary, simulating a short single-language paragraph.
169
+ - Homogeneous (25%): a single language plus one foreign sentence, to learn simple code switching.
170
+ - Spliced (10%): a foreign sentence is placed between two same-language sentences, with the first sentence's punctuation stripped and the second forced to lowercase.
171
+ - Mixed (10%): a generic mix of any languages.
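The curriculum above can be sketched as a weighted sampler over composition buckets (a minimal illustration; the bucket names are assumptions):

```python
import random

# Curriculum mix from the model card: bucket name -> sampling probability.
CURRICULUM = {"pure": 0.55, "homogeneous": 0.25, "spliced": 0.10, "mixed": 0.10}

def sample_bucket(rng: random.Random) -> str:
    """Draw one training-row composition bucket according to the curriculum weights."""
    buckets, weights = zip(*CURRICULUM.items())
    return rng.choices(buckets, weights=weights, k=1)[0]
```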
172
+
173
+
174
+
175
+
176
+ | lang | train sentences | train tokens | eval sentences | eval tokens | all sentences | all tokens |
177
+ | :--- | ---: | ---: | ---: | ---: | ---: | ---: |
178
+ | en | 423264 (2.41%) | 9841704 (1.71%) | 3157 (3.84%) | 35025 (1.79%) | 426421 (2.41%) | 9876729 (1.71%) |
179
+ | es | 359106 (2.04%) | 9729675 (1.69%) | 2201 (2.68%) | 22340 (1.14%) | 361307 (2.04%) | 9752015 (1.69%) |
180
+ | ru | 356083 (2.02%) | 8945224 (1.56%) | 2226 (2.71%) | 21578 (1.10%) | 358309 (2.03%) | 8966802 (1.55%) |
181
+ | fr | 354645 (2.02%) | 10591338 (1.84%) | 2213 (2.69%) | 26148 (1.34%) | 356858 (2.02%) | 10617486 (1.84%) |
182
+ | ja | 352243 (2.00%) | 7945312 (1.38%) | 2219 (2.70%) | 25849 (1.32%) | 354462 (2.01%) | 7971161 (1.38%) |
183
+ | pt | 346042 (1.97%) | 8793674 (1.53%) | 2059 (2.50%) | 20881 (1.07%) | 348101 (1.97%) | 8814555 (1.53%) |
184
+ | de | 344644 (1.96%) | 8847457 (1.54%) | 2151 (2.61%) | 24958 (1.27%) | 346795 (1.96%) | 8872415 (1.54%) |
185
+ | it | 343887 (1.95%) | 8790806 (1.53%) | 2000 (2.43%) | 17342 (0.89%) | 345887 (1.96%) | 8808148 (1.53%) |
186
+ | fi | 299568 (1.70%) | 6905536 (1.20%) | 1576 (1.92%) | 14458 (0.74%) | 301144 (1.70%) | 6919994 (1.20%) |
187
+ | uk | 297565 (1.69%) | 7228987 (1.26%) | 1398 (1.70%) | 11391 (0.58%) | 298963 (1.69%) | 7240378 (1.26%) |
188
+ | zh | 294064 (1.67%) | 7329413 (1.28%) | 1717 (2.09%) | 33433 (1.71%) | 295781 (1.67%) | 7362846 (1.28%) |
189
+ | tr | 289328 (1.64%) | 6606625 (1.15%) | 1384 (1.68%) | 11089 (0.57%) | 290712 (1.64%) | 6617714 (1.15%) |
190
+ | he | 289239 (1.64%) | 7792338 (1.36%) | 1100 (1.34%) | 10342 (0.53%) | 290339 (1.64%) | 7802680 (1.35%) |
191
+ | pl | 288423 (1.64%) | 7707293 (1.34%) | 1305 (1.59%) | 11306 (0.58%) | 289728 (1.64%) | 7718599 (1.34%) |
192
+ | hu | 286880 (1.63%) | 7547282 (1.31%) | 1232 (1.50%) | 11115 (0.57%) | 288112 (1.63%) | 7558397 (1.31%) |
193
+ | nl | 280682 (1.60%) | 6057971 (1.05%) | 1296 (1.58%) | 10453 (0.53%) | 281978 (1.60%) | 6068424 (1.05%) |
194
+ | lt | 273386 (1.55%) | 6507776 (1.13%) | 1165 (1.42%) | 10491 (0.54%) | 274551 (1.55%) | 6518267 (1.13%) |
195
+ | eo | 267885 (1.52%) | 6554654 (1.14%) | 1055 (1.28%) | 13271 (0.68%) | 268940 (1.52%) | 6567925 (1.14%) |
196
+ | ar | 257437 (1.46%) | 5125573 (0.89%) | 1327 (1.61%) | 15180 (0.78%) | 258764 (1.46%) | 5140753 (0.89%) |
197
+ | cs | 240507 (1.37%) | 6406086 (1.11%) | 1082 (1.32%) | 9473 (0.48%) | 241589 (1.37%) | 6415559 (1.11%) |
198
+ | mk | 231103 (1.31%) | 6478376 (1.13%) | 953 (1.16%) | 7713 (0.39%) | 232056 (1.31%) | 6486089 (1.12%) |
199
+ | mr | 228596 (1.30%) | 5886608 (1.02%) | 776 (0.94%) | 6332 (0.32%) | 229372 (1.30%) | 5892940 (1.02%) |
200
+ | no | 223605 (1.27%) | 6137131 (1.07%) | 1396 (1.70%) | 40226 (2.05%) | 225001 (1.27%) | 6177357 (1.07%) |
201
+ | da | 222243 (1.26%) | 5375746 (0.94%) | 1201 (1.46%) | 10373 (0.53%) | 223444 (1.26%) | 5386119 (0.93%) |
202
+ | hy | 207937 (1.18%) | 6345675 (1.10%) | 791 (0.96%) | 9276 (0.47%) | 208728 (1.18%) | 6354951 (1.10%) |
203
+ | tl | 207674 (1.18%) | 5561702 (0.97%) | 1017 (1.24%) | 10926 (0.56%) | 208691 (1.18%) | 5572628 (0.97%) |
204
+ | hi | 206552 (1.17%) | 7796062 (1.36%) | 1079 (1.31%) | 47351 (2.42%) | 207631 (1.17%) | 7843413 (1.36%) |
205
+ | ko | 205625 (1.17%) | 6481034 (1.13%) | 1156 (1.41%) | 32355 (1.65%) | 206781 (1.17%) | 6513389 (1.13%) |
206
+ | el | 202334 (1.15%) | 7105554 (1.24%) | 826 (1.00%) | 13412 (0.68%) | 203160 (1.15%) | 7118966 (1.23%) |
207
+ | ro | 194999 (1.11%) | 6206913 (1.08%) | 820 (1.00%) | 14575 (0.74%) | 195819 (1.11%) | 6221488 (1.08%) |
208
+ | fa | 192050 (1.09%) | 5728246 (1.00%) | 696 (0.85%) | 14765 (0.75%) | 192746 (1.09%) | 5743011 (1.00%) |
209
+ | sk | 189330 (1.08%) | 5318617 (0.93%) | 873 (1.06%) | 20779 (1.06%) | 190203 (1.08%) | 5339396 (0.93%) |
210
+ | la | 188201 (1.07%) | 4591159 (0.80%) | 824 (1.00%) | 8557 (0.44%) | 189025 (1.07%) | 4599716 (0.80%) |
211
+ | bg | 187685 (1.07%) | 5860353 (1.02%) | 762 (0.93%) | 16804 (0.86%) | 188447 (1.07%) | 5877157 (1.02%) |
212
+ | be | 181543 (1.03%) | 6528657 (1.14%) | 869 (1.06%) | 25944 (1.32%) | 182412 (1.03%) | 6554601 (1.14%) |
213
+ | is | 180452 (1.03%) | 6146455 (1.07%) | 959 (1.17%) | 39591 (2.02%) | 181411 (1.03%) | 6186046 (1.07%) |
214
+ | lv | 179142 (1.02%) | 5867897 (1.02%) | 762 (0.93%) | 33481 (1.71%) | 179904 (1.02%) | 5901378 (1.02%) |
215
+ | ckb | 174282 (0.99%) | 7825141 (1.36%) | 667 (0.81%) | 28756 (1.47%) | 174949 (0.99%) | 7853897 (1.36%) |
216
+ | ms | 172573 (0.98%) | 4614764 (0.80%) | 815 (0.99%) | 24769 (1.26%) | 173388 (0.98%) | 4639533 (0.80%) |
217
+ | ka | 170876 (0.97%) | 5505127 (0.96%) | 673 (0.82%) | 20651 (1.05%) | 171549 (0.97%) | 5525778 (0.96%) |
218
+ | kk | 170695 (0.97%) | 5132560 (0.89%) | 676 (0.82%) | 18695 (0.95%) | 171371 (0.97%) | 5151255 (0.89%) |
219
+ | bn | 170721 (0.97%) | 6393448 (1.11%) | 441 (0.54%) | 14727 (0.75%) | 171162 (0.97%) | 6408175 (1.11%) |
220
+ | eu | 168462 (0.96%) | 5737310 (1.00%) | 746 (0.91%) | 37196 (1.90%) | 169208 (0.96%) | 5774506 (1.00%) |
221
+ | as | 168746 (0.96%) | 8564682 (1.49%) | 445 (0.54%) | 24444 (1.25%) | 169191 (0.96%) | 8589126 (1.49%) |
222
+ | mn | 167543 (0.95%) | 5678049 (0.99%) | 703 (0.85%) | 20347 (1.04%) | 168246 (0.95%) | 5698396 (0.99%) |
223
+ | ur | 165992 (0.94%) | 5361179 (0.93%) | 684 (0.83%) | 22622 (1.16%) | 166676 (0.94%) | 5383801 (0.93%) |
224
+ | oc | 165863 (0.94%) | 5735536 (1.00%) | 730 (0.89%) | 18599 (0.95%) | 166593 (0.94%) | 5754135 (1.00%) |
225
+ | ba | 164919 (0.94%) | 8387828 (1.46%) | 699 (0.85%) | 35927 (1.83%) | 165618 (0.94%) | 8423755 (1.46%) |
226
+ | th | 164429 (0.93%) | 5495248 (0.96%) | 649 (0.79%) | 22113 (1.13%) | 165078 (0.93%) | 5517361 (0.96%) |
227
+ | ky | 164374 (0.93%) | 5199548 (0.90%) | 683 (0.83%) | 18956 (0.97%) | 165057 (0.93%) | 5218504 (0.90%) |
228
+ | hr | 163828 (0.93%) | 5183677 (0.90%) | 711 (0.86%) | 33845 (1.73%) | 164539 (0.93%) | 5217522 (0.90%) |
229
+ | ps | 163238 (0.93%) | 4735113 (0.82%) | 674 (0.82%) | 18515 (0.95%) | 163912 (0.93%) | 4753628 (0.82%) |
230
+ | id | 163187 (0.93%) | 4025079 (0.70%) | 723 (0.88%) | 13371 (0.68%) | 163910 (0.93%) | 4038450 (0.70%) |
231
+ | pa | 162180 (0.92%) | 7621059 (1.33%) | 581 (0.71%) | 29036 (1.48%) | 162761 (0.92%) | 7650095 (1.33%) |
232
+ | sw | 161777 (0.92%) | 5013161 (0.87%) | 653 (0.79%) | 26493 (1.35%) | 162430 (0.92%) | 5039654 (0.87%) |
233
+ | af | 160455 (0.91%) | 4676798 (0.81%) | 932 (1.13%) | 27369 (1.40%) | 161387 (0.91%) | 4704167 (0.82%) |
234
+ | jv | 156292 (0.89%) | 4752381 (0.83%) | 576 (0.70%) | 22573 (1.15%) | 156868 (0.89%) | 4774954 (0.83%) |
235
+ | tt | 154833 (0.88%) | 5165763 (0.90%) | 578 (0.70%) | 7298 (0.37%) | 155411 (0.88%) | 5173061 (0.90%) |
236
+ | cy | 153551 (0.87%) | 5656404 (0.98%) | 653 (0.79%) | 29503 (1.51%) | 154204 (0.87%) | 5685907 (0.99%) |
237
+ | ga | 150458 (0.86%) | 5488243 (0.95%) | 680 (0.83%) | 33471 (1.71%) | 151138 (0.86%) | 5521714 (0.96%) |
238
+ | kn | 150184 (0.85%) | 14992479 (2.61%) | 697 (0.85%) | 49288 (2.52%) | 150881 (0.85%) | 15041767 (2.61%) |
239
+ | bs | 150037 (0.85%) | 4582900 (0.80%) | 649 (0.79%) | 25588 (1.31%) | 150686 (0.85%) | 4608488 (0.80%) |
240
+ | ca | 149401 (0.85%) | 5477662 (0.95%) | 629 (0.76%) | 21391 (1.09%) | 150030 (0.85%) | 5499053 (0.95%) |
241
+ | ne | 148716 (0.85%) | 4855198 (0.84%) | 535 (0.65%) | 16246 (0.83%) | 149251 (0.84%) | 4871444 (0.84%) |
242
+ | ku | 147702 (0.84%) | 4973601 (0.87%) | 574 (0.70%) | 28196 (1.44%) | 148276 (0.84%) | 5001797 (0.87%) |
243
+ | gl | 147011 (0.84%) | 4554907 (0.79%) | 658 (0.80%) | 20127 (1.03%) | 147669 (0.84%) | 4575034 (0.79%) |
244
+ | uz | 145433 (0.83%) | 4704898 (0.82%) | 573 (0.70%) | 21862 (1.12%) | 146006 (0.83%) | 4726760 (0.82%) |
245
+ | sl | 144084 (0.82%) | 3851696 (0.67%) | 651 (0.79%) | 18164 (0.93%) | 144735 (0.82%) | 3869860 (0.67%) |
246
+ | sv | 143041 (0.81%) | 4006332 (0.70%) | 905 (1.10%) | 7012 (0.36%) | 143946 (0.81%) | 4013344 (0.70%) |
247
+ | tg | 136703 (0.78%) | 7664329 (1.33%) | 572 (0.70%) | 34220 (1.75%) | 137275 (0.78%) | 7698549 (1.33%) |
248
+ | et | 131007 (0.74%) | 3280590 (0.57%) | 549 (0.67%) | 14021 (0.72%) | 131556 (0.74%) | 3294611 (0.57%) |
249
+ | br | 130223 (0.74%) | 4495403 (0.78%) | 546 (0.66%) | 17304 (0.88%) | 130769 (0.74%) | 4512707 (0.78%) |
250
+ | lb | 129528 (0.74%) | 4421411 (0.77%) | 495 (0.60%) | 17761 (0.91%) | 130023 (0.74%) | 4439172 (0.77%) |
251
+ | su | 129144 (0.73%) | 4215719 (0.73%) | 535 (0.65%) | 21391 (1.09%) | 129679 (0.73%) | 4237110 (0.73%) |
252
+ | mt | 128626 (0.73%) | 6671441 (1.16%) | 508 (0.62%) | 26729 (1.36%) | 129134 (0.73%) | 6698170 (1.16%) |
253
+ | sq | 119431 (0.68%) | 4107917 (0.71%) | 561 (0.68%) | 18633 (0.95%) | 119992 (0.68%) | 4126550 (0.72%) |
254
+ | sr | 117855 (0.67%) | 3160599 (0.55%) | 427 (0.52%) | 3505 (0.18%) | 118282 (0.67%) | 3164104 (0.55%) |
255
+ | or | 110709 (0.63%) | 3922431 (0.68%) | 410 (0.50%) | 13094 (0.67%) | 111119 (0.63%) | 3935525 (0.68%) |
256
+ | ml | 110085 (0.63%) | 10929013 (1.90%) | 464 (0.56%) | 36922 (1.89%) | 110549 (0.63%) | 10965935 (1.90%) |
257
+ | yi | 104494 (0.59%) | 4085563 (0.71%) | 400 (0.49%) | 6005 (0.31%) | 104894 (0.59%) | 4091568 (0.71%) |
258
+ | te | 101076 (0.57%) | 9757033 (1.70%) | 430 (0.52%) | 37897 (1.94%) | 101506 (0.57%) | 9794930 (1.70%) |
259
+ | ta | 94122 (0.53%) | 7917169 (1.38%) | 386 (0.47%) | 26610 (1.36%) | 94508 (0.53%) | 7943779 (1.38%) |
260
+ | mg | 93939 (0.53%) | 3291017 (0.57%) | 391 (0.48%) | 11698 (0.60%) | 94330 (0.53%) | 3302715 (0.57%) |
261
+ | si | 92723 (0.53%) | 5275463 (0.92%) | 364 (0.44%) | 18426 (0.94%) | 93087 (0.53%) | 5293889 (0.92%) |
262
+ | vi | 74916 (0.43%) | 2535825 (0.44%) | 335 (0.41%) | 3396 (0.17%) | 75251 (0.43%) | 2539221 (0.44%) |
263
+ | rm | 74806 (0.43%) | 2826708 (0.49%) | 318 (0.39%) | 12654 (0.65%) | 75124 (0.43%) | 2839362 (0.49%) |
264
+ | gu | 70961 (0.40%) | 7859622 (1.37%) | 335 (0.41%) | 28389 (1.45%) | 71296 (0.40%) | 7888011 (1.37%) |
265
+ | bo | 69565 (0.40%) | 1378245 (0.24%) | 263 (0.32%) | 5407 (0.28%) | 69828 (0.40%) | 1383652 (0.24%) |
266
+ | ug | 64297 (0.37%) | 1427585 (0.25%) | 260 (0.32%) | 4769 (0.24%) | 64557 (0.37%) | 1432354 (0.25%) |
267
+ | dv | 60328 (0.34%) | 1557497 (0.27%) | 215 (0.26%) | 5844 (0.30%) | 60543 (0.34%) | 1563341 (0.27%) |
268
+ | am | 59339 (0.34%) | 2705311 (0.47%) | 235 (0.29%) | 10768 (0.55%) | 59574 (0.34%) | 2716079 (0.47%) |
269
+ | yo | 59246 (0.34%) | 3649130 (0.63%) | 260 (0.32%) | 21157 (1.08%) | 59506 (0.34%) | 3670287 (0.64%) |
270
+ | my | 58575 (0.33%) | 2165089 (0.38%) | 214 (0.26%) | 8142 (0.42%) | 58789 (0.33%) | 2173231 (0.38%) |
271
+ | km | 57081 (0.32%) | 3056236 (0.53%) | 193 (0.23%) | 10606 (0.54%) | 57274 (0.32%) | 3066842 (0.53%) |
272
+ | so | 56160 (0.32%) | 2044409 (0.36%) | 212 (0.26%) | 8847 (0.45%) | 56372 (0.32%) | 2053256 (0.36%) |
273
+ | sd | 55359 (0.31%) | 3226018 (0.56%) | 217 (0.26%) | 10847 (0.55%) | 55576 (0.31%) | 3236865 (0.56%) |
274
+ | zu | 52465 (0.30%) | 2406841 (0.42%) | 203 (0.25%) | 9751 (0.50%) | 52668 (0.30%) | 2416592 (0.42%) |
275
+ | lo | 50641 (0.29%) | 1747495 (0.30%) | 189 (0.23%) | 6221 (0.32%) | 50830 (0.29%) | 1753716 (0.30%) |
276
+ | ti | 47785 (0.27%) | 2895617 (0.50%) | 195 (0.24%) | 12699 (0.65%) | 47980 (0.27%) | 2908316 (0.50%) |
277
+ | ce | 45014 (0.26%) | 2425219 (0.42%) | 188 (0.23%) | 9950 (0.51%) | 45202 (0.26%) | 2435169 (0.42%) |
278
+ | ny | 43552 (0.25%) | 2051132 (0.36%) | 171 (0.21%) | 8286 (0.42%) | 43723 (0.25%) | 2059418 (0.36%) |
279
+ | gd | 36623 (0.21%) | 1273243 (0.22%) | 156 (0.19%) | 3615 (0.18%) | 36779 (0.21%) | 1276858 (0.22%) |
280
+ | xh | 24432 (0.14%) | 911850 (0.16%) | 93 (0.11%) | 3528 (0.18%) | 24525 (0.14%) | 915378 (0.16%) |
281
+ | om | 15372 (0.09%) | 545603 (0.09%) | 77 (0.09%) | 2564 (0.13%) | 15449 (0.09%) | 548167 (0.10%) |
282
+ | sco | 8772 (0.05%) | 233030 (0.04%) | 37 (0.04%) | 828 (0.04%) | 8809 (0.05%) | 233858 (0.04%) |
283
+ | **total** | 17593786 (100.00%) | 574735483 (100.00%) | 82270 (100.00%) | 1958217 (100.00%) | 17676056 (100.00%) | 576693700 (100.00%) |
284
+
285
+
286
+ This model is a fine-tuned version of [xlm-roberta-base](https://huggingface.co/xlm-roberta-base) on the synthetic multilingual corpus described above.
287
+ It achieves the following results on the evaluation set:
288
+ - Loss: 0.0306
289
+ - Precision: 0.9507
290
+ - Recall: 0.9644
291
+ - F1: 0.9575
292
+ - Accuracy: 0.9917
293
+
294
+ ## Training procedure
295
+
296
+ ### Training hyperparameters
297
+
298
+ The following hyperparameters were used during training:
299
+ - learning_rate: 5e-05
300
+ - train_batch_size: 72
301
+ - eval_batch_size: 36
302
+ - seed: 42
303
+ - gradient_accumulation_steps: 2
304
+ - total_train_batch_size: 144
305
+ - optimizer: AdamW (torch fused) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
306
+ - lr_scheduler_type: linear
307
+ - num_epochs: 2
308
+ - mixed_precision_training: Native AMP
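The hyperparameters above roughly correspond to a `transformers.TrainingArguments` configuration like the following (reconstructed from the card as a sketch; the original training script is not published here):

```python
from transformers import TrainingArguments

# Sketch of how the listed hyperparameters map onto TrainingArguments;
# reconstructed from the card, not the exact script used for training.
args = TrainingArguments(
    output_dir="polyglot-tagger",
    learning_rate=5e-5,
    per_device_train_batch_size=72,
    per_device_eval_batch_size=36,
    gradient_accumulation_steps=2,   # effective train batch size 144
    num_train_epochs=2,
    lr_scheduler_type="linear",
    seed=42,
    fp16=True,                       # "Native AMP" mixed precision
)
```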
309
+
310
+ ### Training results
311
+
312
+ | Training Loss | Epoch | Step | Validation Loss | Precision | Recall | F1 | Accuracy |
313
+ |:-------------:|:------:|:-----:|:---------------:|:---------:|:------:|:------:|:--------:|
314
+ | 0.0918 | 0.0731 | 2500 | 0.1050 | 0.7984 | 0.8818 | 0.8381 | 0.9735 |
315
+ | 0.0717 | 0.1463 | 5000 | 0.0797 | 0.8393 | 0.9041 | 0.8705 | 0.9782 |
316
+ | 0.0624 | 0.2194 | 7500 | 0.0762 | 0.8664 | 0.9166 | 0.8908 | 0.9804 |
317
+ | 0.0562 | 0.2925 | 10000 | 0.0620 | 0.8758 | 0.9247 | 0.8995 | 0.9830 |
318
+ | 0.0516 | 0.3657 | 12500 | 0.0576 | 0.8844 | 0.9298 | 0.9065 | 0.9845 |
319
+ | 0.0465 | 0.4388 | 15000 | 0.0543 | 0.8993 | 0.9357 | 0.9172 | 0.9857 |
320
+ | 0.0433 | 0.5119 | 17500 | 0.0558 | 0.9005 | 0.9356 | 0.9177 | 0.9856 |
321
+ | 0.0411 | 0.5851 | 20000 | 0.0499 | 0.9012 | 0.9385 | 0.9195 | 0.9867 |
322
+ | 0.0420 | 0.6582 | 22500 | 0.0460 | 0.9167 | 0.9438 | 0.9300 | 0.9873 |
323
+ | 0.0392 | 0.7313 | 25000 | 0.0441 | 0.9149 | 0.9448 | 0.9296 | 0.9878 |
324
+ | 0.0386 | 0.8045 | 27500 | 0.0434 | 0.9200 | 0.9476 | 0.9336 | 0.9885 |
325
+ | 0.0357 | 0.8776 | 30000 | 0.0422 | 0.9235 | 0.9503 | 0.9367 | 0.9886 |
326
+ | 0.0356 | 0.9507 | 32500 | 0.0404 | 0.9272 | 0.9520 | 0.9395 | 0.9890 |
327
+ | 0.0261 | 1.0238 | 35000 | 0.0381 | 0.9293 | 0.9529 | 0.9409 | 0.9898 |
328
+ | 0.0322 | 1.0970 | 37500 | 0.0371 | 0.9346 | 0.9558 | 0.9451 | 0.9899 |
329
+ | 0.0303 | 1.1701 | 40000 | 0.0374 | 0.9375 | 0.9580 | 0.9476 | 0.9903 |
330
+ | 0.0276 | 1.2432 | 42500 | 0.0378 | 0.9355 | 0.9566 | 0.9460 | 0.9901 |
331
+ | 0.0264 | 1.3164 | 45000 | 0.0353 | 0.9373 | 0.9574 | 0.9472 | 0.9904 |
332
+ | 0.0228 | 1.3895 | 47500 | 0.0366 | 0.9398 | 0.9589 | 0.9493 | 0.9903 |
333
+ | 0.0234 | 1.4626 | 50000 | 0.0343 | 0.9430 | 0.9602 | 0.9516 | 0.9907 |
334
+ | 0.0274 | 1.5358 | 52500 | 0.0339 | 0.9396 | 0.9591 | 0.9492 | 0.9906 |
335
+ | 0.0236 | 1.6089 | 55000 | 0.0324 | 0.9438 | 0.9613 | 0.9525 | 0.9913 |
336
+ | 0.0244 | 1.6820 | 57500 | 0.0322 | 0.9478 | 0.9624 | 0.9551 | 0.9914 |
337
+ | 0.0222 | 1.7552 | 60000 | 0.0323 | 0.9483 | 0.9628 | 0.9555 | 0.9914 |
338
+ | 0.0238 | 1.8283 | 62500 | 0.0320 | 0.9480 | 0.9630 | 0.9554 | 0.9913 |
339
+ | 0.0223 | 1.9014 | 65000 | 0.0320 | 0.9485 | 0.9637 | 0.9560 | 0.9913 |
340
+ | 0.0208 | 1.9746 | 67500 | 0.0306 | 0.9507 | 0.9644 | 0.9575 | 0.9917 |
341
+
342
+
343
+ ### Framework versions
344
+
345
+ - Transformers 5.0.0
346
+ - Pytorch 2.10.0+cu128
347
+ - Datasets 4.0.0
348
+ - Tokenizers 0.22.2
config.json ADDED
@@ -0,0 +1,456 @@
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "XLMRobertaForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": 2,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "O",
16
+ "1": "B-EN",
17
+ "2": "I-EN",
18
+ "3": "B-ES",
19
+ "4": "I-ES",
20
+ "5": "B-FR",
21
+ "6": "I-FR",
22
+ "7": "B-DE",
23
+ "8": "I-DE",
24
+ "9": "B-IT",
25
+ "10": "I-IT",
26
+ "11": "B-PT",
27
+ "12": "I-PT",
28
+ "13": "B-NL",
29
+ "14": "I-NL",
30
+ "15": "B-VI",
31
+ "16": "I-VI",
32
+ "17": "B-TR",
33
+ "18": "I-TR",
34
+ "19": "B-LA",
35
+ "20": "I-LA",
36
+ "21": "B-ID",
37
+ "22": "I-ID",
38
+ "23": "B-MS",
39
+ "24": "I-MS",
40
+ "25": "B-AF",
41
+ "26": "I-AF",
42
+ "27": "B-SQ",
43
+ "28": "I-SQ",
44
+ "29": "B-IS",
45
+ "30": "I-IS",
46
+ "31": "B-NO",
47
+ "32": "I-NO",
48
+ "33": "B-SV",
49
+ "34": "I-SV",
50
+ "35": "B-DA",
51
+ "36": "I-DA",
52
+ "37": "B-FI",
53
+ "38": "I-FI",
54
+ "39": "B-HU",
55
+ "40": "I-HU",
56
+ "41": "B-PL",
57
+ "42": "I-PL",
58
+ "43": "B-CS",
59
+ "44": "I-CS",
60
+ "45": "B-RO",
61
+ "46": "I-RO",
62
+ "47": "B-RU",
63
+ "48": "I-RU",
64
+ "49": "B-BG",
65
+ "50": "I-BG",
66
+ "51": "B-UK",
67
+ "52": "I-UK",
68
+ "53": "B-SR",
69
+ "54": "I-SR",
70
+ "55": "B-BE",
71
+ "56": "I-BE",
72
+ "57": "B-KK",
73
+ "58": "I-KK",
74
+ "59": "B-MK",
75
+ "60": "I-MK",
76
+ "61": "B-MN",
77
+ "62": "I-MN",
78
+ "63": "B-ZH",
79
+ "64": "I-ZH",
80
+ "65": "B-JA",
81
+ "66": "I-JA",
82
+ "67": "B-KO",
83
+ "68": "I-KO",
84
+ "69": "B-HI",
85
+ "70": "I-HI",
86
+ "71": "B-UR",
87
+ "72": "I-UR",
88
+ "73": "B-BN",
89
+ "74": "I-BN",
90
+ "75": "B-TA",
91
+ "76": "I-TA",
92
+ "77": "B-TE",
93
+ "78": "I-TE",
94
+ "79": "B-MR",
95
+ "80": "I-MR",
96
+ "81": "B-GU",
97
+ "82": "I-GU",
98
+ "83": "B-KN",
99
+ "84": "I-KN",
100
+ "85": "B-ML",
101
+ "86": "I-ML",
102
+ "87": "B-PA",
103
+ "88": "I-PA",
104
+ "89": "B-AS",
105
+ "90": "I-AS",
106
+ "91": "B-OR",
107
+ "92": "I-OR",
108
+ "93": "B-AR",
109
+ "94": "I-AR",
110
+ "95": "B-FA",
111
+ "96": "I-FA",
112
+ "97": "B-PS",
113
+ "98": "I-PS",
114
+ "99": "B-SD",
115
+ "100": "I-SD",
116
+ "101": "B-UG",
117
+ "102": "I-UG",
118
+ "103": "B-EL",
119
+ "104": "I-EL",
120
+ "105": "B-HE",
121
+ "106": "I-HE",
122
+ "107": "B-YI",
123
+ "108": "I-YI",
124
+ "109": "B-HY",
125
+ "110": "I-HY",
126
+ "111": "B-KA",
127
+ "112": "I-KA",
128
+ "113": "B-AM",
129
+ "114": "I-AM",
130
+ "115": "B-KM",
131
+ "116": "I-KM",
132
+ "117": "B-LO",
133
+ "118": "I-LO",
134
+ "119": "B-MY",
135
+ "120": "I-MY",
136
+ "121": "B-TH",
137
+ "122": "I-TH",
138
+ "123": "B-SI",
139
+ "124": "I-SI",
140
+ "125": "B-BO",
141
+ "126": "I-BO",
142
+ "127": "B-DV",
143
+ "128": "I-DV",
144
+ "129": "B-TI",
145
+ "130": "I-TI",
146
+ "131": "B-SW",
147
+ "132": "I-SW",
148
+ "133": "B-EU",
149
+ "134": "I-EU",
150
+ "135": "B-TL",
151
+ "136": "I-TL",
152
+ "137": "B-XH",
153
+ "138": "I-XH",
154
+ "139": "B-CA",
155
+ "140": "I-CA",
156
+ "141": "B-GL",
157
+ "142": "I-GL",
158
+ "143": "B-OC",
159
+ "144": "I-OC",
160
+ "145": "B-BR",
161
+ "146": "I-BR",
162
+ "147": "B-GA",
163
+ "148": "I-GA",
164
+ "149": "B-GD",
165
+ "150": "I-GD",
166
+ "151": "B-CY",
167
+ "152": "I-CY",
168
+ "153": "B-SCO",
169
+ "154": "I-SCO",
170
+ "155": "B-BS",
171
+ "156": "I-BS",
172
+ "157": "B-HR",
173
+ "158": "I-HR",
174
+ "159": "B-SL",
175
+ "160": "I-SL",
176
+ "161": "B-SK",
177
+ "162": "I-SK",
178
+ "163": "B-ET",
179
+ "164": "I-ET",
180
+ "165": "B-LV",
181
+ "166": "I-LV",
182
+ "167": "B-LT",
183
+ "168": "I-LT",
184
+ "169": "B-EO",
185
+ "170": "I-EO",
186
+ "171": "B-JV",
187
+ "172": "I-JV",
188
+ "173": "B-MG",
189
+ "174": "I-MG",
190
+ "175": "B-OM",
191
+ "176": "I-OM",
192
+ "177": "B-SO",
193
+ "178": "I-SO",
194
+ "179": "B-SU",
195
+ "180": "I-SU",
196
+ "181": "B-UZ",
197
+ "182": "I-UZ",
198
+ "183": "B-KU",
199
+ "184": "I-KU",
200
+ "185": "B-CKB",
201
+ "186": "I-CKB",
202
+ "187": "B-NE",
203
+ "188": "I-NE",
204
+ "189": "B-MT",
205
+ "190": "I-MT",
206
+ "191": "B-LB",
207
+ "192": "I-LB",
208
+ "193": "B-RM",
209
+ "194": "I-RM",
210
+ "195": "B-TT",
211
+ "196": "I-TT",
212
+ "197": "B-KY",
213
+ "198": "I-KY",
214
+ "199": "B-TG",
215
+ "200": "I-TG",
216
+ "201": "B-BA",
217
+ "202": "I-BA",
218
+ "203": "B-YO",
219
+ "204": "I-YO",
220
+ "205": "B-ZU",
221
+ "206": "I-ZU",
222
+ "207": "B-NY",
223
+ "208": "I-NY",
224
+ "209": "B-CE",
225
+ "210": "I-CE"
226
+ },
227
+ "initializer_range": 0.02,
228
+ "intermediate_size": 3072,
229
+ "is_decoder": false,
230
+ "label2id": {
231
+ "B-AF": 25,
232
+ "B-AM": 113,
233
+ "B-AR": 93,
234
+ "B-AS": 89,
235
+ "B-BA": 201,
236
+ "B-BE": 55,
237
+ "B-BG": 49,
238
+ "B-BN": 73,
239
+ "B-BO": 125,
240
+ "B-BR": 145,
241
+ "B-BS": 155,
242
+ "B-CA": 139,
243
+ "B-CE": 209,
244
+ "B-CKB": 185,
245
+ "B-CS": 43,
246
+ "B-CY": 151,
247
+ "B-DA": 35,
248
+ "B-DE": 7,
249
+ "B-DV": 127,
250
+ "B-EL": 103,
251
+ "B-EN": 1,
252
+ "B-EO": 169,
253
+ "B-ES": 3,
254
+ "B-ET": 163,
255
+ "B-EU": 133,
256
+ "B-FA": 95,
257
+ "B-FI": 37,
258
+ "B-FR": 5,
259
+ "B-GA": 147,
260
+ "B-GD": 149,
261
+ "B-GL": 141,
262
+ "B-GU": 81,
263
+ "B-HE": 105,
264
+ "B-HI": 69,
265
+ "B-HR": 157,
266
+ "B-HU": 39,
267
+ "B-HY": 109,
268
+ "B-ID": 21,
269
+ "B-IS": 29,
270
+ "B-IT": 9,
271
+ "B-JA": 65,
272
+ "B-JV": 171,
273
+ "B-KA": 111,
274
+ "B-KK": 57,
275
+ "B-KM": 115,
276
+ "B-KN": 83,
277
+ "B-KO": 67,
278
+ "B-KU": 183,
279
+ "B-KY": 197,
280
+ "B-LA": 19,
281
+ "B-LB": 191,
282
+ "B-LO": 117,
283
+ "B-LT": 167,
284
+ "B-LV": 165,
285
+ "B-MG": 173,
286
+ "B-MK": 59,
287
+ "B-ML": 85,
288
+ "B-MN": 61,
289
+ "B-MR": 79,
290
+ "B-MS": 23,
291
+ "B-MT": 189,
292
+ "B-MY": 119,
293
+ "B-NE": 187,
294
+ "B-NL": 13,
295
+ "B-NO": 31,
296
+ "B-NY": 207,
297
+ "B-OC": 143,
298
+ "B-OM": 175,
299
+ "B-OR": 91,
300
+ "B-PA": 87,
301
+ "B-PL": 41,
302
+ "B-PS": 97,
303
+ "B-PT": 11,
304
+ "B-RM": 193,
305
+ "B-RO": 45,
306
+ "B-RU": 47,
307
+ "B-SCO": 153,
308
+ "B-SD": 99,
309
+ "B-SI": 123,
310
+ "B-SK": 161,
311
+ "B-SL": 159,
312
+ "B-SO": 177,
313
+ "B-SQ": 27,
314
+ "B-SR": 53,
315
+ "B-SU": 179,
316
+ "B-SV": 33,
317
+ "B-SW": 131,
318
+ "B-TA": 75,
319
+ "B-TE": 77,
320
+ "B-TG": 199,
321
+ "B-TH": 121,
322
+ "B-TI": 129,
323
+ "B-TL": 135,
324
+ "B-TR": 17,
325
+ "B-TT": 195,
326
+ "B-UG": 101,
327
+ "B-UK": 51,
328
+ "B-UR": 71,
329
+ "B-UZ": 181,
330
+ "B-VI": 15,
331
+ "B-XH": 137,
332
+ "B-YI": 107,
333
+ "B-YO": 203,
334
+ "B-ZH": 63,
335
+ "B-ZU": 205,
336
+ "I-AF": 26,
337
+ "I-AM": 114,
338
+ "I-AR": 94,
339
+ "I-AS": 90,
340
+ "I-BA": 202,
341
+ "I-BE": 56,
342
+ "I-BG": 50,
343
+ "I-BN": 74,
344
+ "I-BO": 126,
345
+ "I-BR": 146,
346
+ "I-BS": 156,
347
+ "I-CA": 140,
348
+ "I-CE": 210,
349
+ "I-CKB": 186,
350
+ "I-CS": 44,
351
+ "I-CY": 152,
352
+ "I-DA": 36,
353
+ "I-DE": 8,
354
+ "I-DV": 128,
355
+ "I-EL": 104,
356
+ "I-EN": 2,
357
+ "I-EO": 170,
358
+ "I-ES": 4,
359
+ "I-ET": 164,
360
+ "I-EU": 134,
361
+ "I-FA": 96,
362
+ "I-FI": 38,
363
+ "I-FR": 6,
364
+ "I-GA": 148,
365
+ "I-GD": 150,
366
+ "I-GL": 142,
367
+ "I-GU": 82,
368
+ "I-HE": 106,
369
+ "I-HI": 70,
370
+ "I-HR": 158,
371
+ "I-HU": 40,
372
+ "I-HY": 110,
373
+ "I-ID": 22,
374
+ "I-IS": 30,
375
+ "I-IT": 10,
376
+ "I-JA": 66,
377
+ "I-JV": 172,
378
+ "I-KA": 112,
379
+ "I-KK": 58,
380
+ "I-KM": 116,
381
+ "I-KN": 84,
382
+ "I-KO": 68,
383
+ "I-KU": 184,
384
+ "I-KY": 198,
385
+ "I-LA": 20,
386
+ "I-LB": 192,
387
+ "I-LO": 118,
388
+ "I-LT": 168,
389
+ "I-LV": 166,
390
+ "I-MG": 174,
391
+ "I-MK": 60,
392
+ "I-ML": 86,
393
+ "I-MN": 62,
394
+ "I-MR": 80,
395
+ "I-MS": 24,
396
+ "I-MT": 190,
397
+ "I-MY": 120,
398
+ "I-NE": 188,
399
+ "I-NL": 14,
400
+ "I-NO": 32,
401
+ "I-NY": 208,
402
+ "I-OC": 144,
403
+ "I-OM": 176,
404
+ "I-OR": 92,
405
+ "I-PA": 88,
406
+ "I-PL": 42,
407
+ "I-PS": 98,
408
+ "I-PT": 12,
409
+ "I-RM": 194,
410
+ "I-RO": 46,
411
+ "I-RU": 48,
412
+ "I-SCO": 154,
413
+ "I-SD": 100,
414
+ "I-SI": 124,
415
+ "I-SK": 162,
416
+ "I-SL": 160,
417
+ "I-SO": 178,
418
+ "I-SQ": 28,
419
+ "I-SR": 54,
420
+ "I-SU": 180,
421
+ "I-SV": 34,
422
+ "I-SW": 132,
423
+ "I-TA": 76,
424
+ "I-TE": 78,
425
+ "I-TG": 200,
426
+ "I-TH": 122,
427
+ "I-TI": 130,
428
+ "I-TL": 136,
429
+ "I-TR": 18,
430
+ "I-TT": 196,
431
+ "I-UG": 102,
432
+ "I-UK": 52,
433
+ "I-UR": 72,
434
+ "I-UZ": 182,
435
+ "I-VI": 16,
436
+ "I-XH": 138,
437
+ "I-YI": 108,
438
+ "I-YO": 204,
439
+ "I-ZH": 64,
440
+ "I-ZU": 206,
441
+ "O": 0
442
+ },
443
+ "layer_norm_eps": 1e-05,
444
+ "max_position_embeddings": 514,
445
+ "model_type": "xlm-roberta",
446
+ "num_attention_heads": 12,
447
+ "num_hidden_layers": 12,
448
+ "output_past": true,
449
+ "pad_token_id": 1,
450
+ "position_embedding_type": "absolute",
451
+ "tie_word_embeddings": true,
452
+ "transformers_version": "5.0.0",
453
+ "type_vocab_size": 1,
454
+ "use_cache": false,
455
+ "vocab_size": 250002
456
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2455b960fbdfa72ad7b04a55ea6dc99bddb425c54bb3f4023295d9a7470a4d44
3
+ size 1110485292
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a5451f31fe3f899dcd75ec2ad93f415528c9b5f58bb7a5a1c6dd5884fb56257
3
+ size 16781486
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "add_prefix_space": true,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "is_local": false,
8
+ "mask_token": "<mask>",
9
+ "model_max_length": 512,
10
+ "pad_token": "<pad>",
11
+ "sep_token": "</s>",
12
+ "tokenizer_class": "XLMRobertaTokenizer",
13
+ "unk_token": "<unk>"
14
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d5d81020e6bbd67f3b2229ebb1af43c3661c83bb3947ad30ea05b011fc7aa50
3
+ size 5201