coppercitylabs committed on
Commit
3047552
1 Parent(s): f772022
Files changed (7) hide show
  1. LICENSE.md +20 -0
  2. README.md +62 -0
  3. config.json +24 -0
  4. pytorch_model.bin +3 -0
  5. tokenizer.json +0 -0
  6. tokenizer_config.json +1 -0
  7. vocab.txt +0 -0
LICENSE.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright 2021 CopperCityLabs
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a
4
+ copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included
12
+ in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: uz (cyrl)
3
+ tags:
4
+ - uzbert
5
+ - uzbek
6
+ - bert
7
+ license: mit
8
+ datasets:
9
+ - webcrawl corpus (~142M words)
10
+ ---
11
+
12
+ # UzBERT base model (uncased)
13
+
14
+ Pretrained model on the Uzbek language (Cyrillic script) using masked
15
+ language modeling and next sentence prediction objectives.
16
+
17
+ ## How to use
18
+
19
+ You can use this model directly with a pipeline for masked language modeling:
20
+
21
+ ```python
22
+ >>> from transformers import pipeline
23
+ >>> unmasker = pipeline('fill-mask', model='coppercitylabs/uzbert-base-uncased')
24
+ >>> unmasker("Алишер Навоий – улуғ ўзбек ва бошқа туркий халқларнинг [MASK], мутафаккири ва давлат арбоби бўлган.")
25
+
26
+ [
27
+ {
28
+ 'token_str': 'шоири',
29
+ 'token': 13587,
30
+ 'score': 0.7974384427070618,
31
+ 'sequence': 'алишер навоий – улуғ ўзбек ва бошқа туркий халқларнинг шоири, мутафаккир ##и ва давлат арбоби бўлган.'
32
+ },
33
+ {
34
+ 'token_str': 'олими',
35
+ 'token': 18500,
36
+ 'score': 0.09166576713323593,
37
+ 'sequence': 'алишер навоий – улуғ ўзбек ва бошқа туркий халқларнинг олими, мутафаккир ##и ва давлат арбоби бўлган.'
38
+ },
39
+ {
40
+ 'token_str': 'асосчиси',
41
+ 'token': 7469,
42
+ 'score': 0.02451123297214508,
43
+ 'sequence': 'алишер навоий – улуғ ўзбек ва бошқа туркий халқларнинг асосчиси, мутафаккир ##и ва давлат арбоби бўлган.'
44
+ },
45
+ {
46
+ 'token_str': 'ёзувчиси',
47
+ 'token': 22439,
48
+ 'score': 0.017601722851395607,
49
+ 'sequence': 'алишер навоий – улуғ ўзбек ва бошқа туркий халқларнинг ёзувчиси, мутафаккир ##и ва давлат арбоби бўлган.'
50
+ },
51
+ {
52
+ 'token_str': 'устози',
53
+ 'token': 11494,
54
+ 'score': 0.010115668177604675,
55
+ 'sequence': 'алишер навоий – улуғ ўзбек ва бошқа туркий халқларнинг устози, мутафаккир ##и ва давлат арбоби бўлган.'
56
+ }
57
+ ]
58
+ ```
59
+
60
+ ## Training data
61
+
62
+ The UzBERT model was pretrained on ~625K news articles.
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/b/uzl/src/bert/../..//out/bert/model-1/",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "transformers_version": "4.8.2",
21
+ "type_vocab_size": 2,
22
+ "use_cache": true,
23
+ "vocab_size": 30000
24
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3d626d7b124b38437492d465aaab46841d1a876427d4be2cac427e7060b9ac7
3
+ size 436536363
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": null, "name_or_path": "/home/b/uzl/src/bert/../..//out/bert/", "tokenizer_class": "BertTokenizer"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff