coppercitylabs committed on
Commit
48237fb
1 Parent(s): af55ee8
LICENSE.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright 2021 CopperCityLabs
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a
4
+ copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included
12
+ in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: uz
3
+ tags:
4
+ - uzbek
5
+ - cyrillic
6
+ - news category classifier
7
+ license: mit
8
+ datasets:
9
+ - webcrawl
10
+ ---
11
+
12
+ # Uzbek news category classifier (based on UzBERT)
13
+
14
+ UzBERT fine-tuned to classify news articles into one of the following
15
+ categories:
16
+
17
+ - дунё
18
+ - жамият
19
+ - жиноят
20
+ - иқтисодиёт
21
+ - маданият
22
+ - реклама
23
+ - саломатлик
24
+ - сиёсат
25
+ - спорт
26
+ - фан ва техника
27
+ - шоу-бизнес
28
+
29
+ ## How to use
30
+
31
+ ```python
32
+ >>> from transformers import pipeline
33
+ >>> classifier = pipeline('text-classification', model='coppercitylabs/uzbek-news-category-classifier')
34
+ >>> text = """Маҳоратли пара-енгил атлетикачимиз Ҳусниддин Норбеков Токио-2020 Паралимпия ўйинларида ғалаба қозониб, делегациямиз ҳисобига навбатдаги олтин медални келтирди. Бу ҳақда МОҚ хабар берди.
35
+
36
+ Норбеков ҳозиргина ядро улоқтириш дастурида ўз ғалабасини тантана қилди. Ушбу машқда вакилимиз 16:13 метр натижа билан энг яхши кўрсаткични қайд этди.
37
+
38
+ Шу тариқа, делегациямиз ҳисобидаги медаллар сони 16 (6 та олтин, 4 та кумуш ва 6 та бронза) тага етди. Кейинги кун дастурларида иштирок этадиган ҳамюртларимизга омад тилаб қоламиз!"""
39
+ >>> classifier(text)
40
+ [{'label': 'спорт', 'score': 0.9865401983261108}]
41
+ ```
42
+
43
+ ## Fine-tuning data
44
+ Fine-tuned on ~60K news articles for 3 epochs.
config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/b/workspace/nlp/nlp-showcase/out/02-news-cateogry-classifier//checkpoint-2886",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "\u0441\u0430\u043b\u043e\u043c\u0430\u0442\u043b\u0438\u043a",
13
+ "1": "\u0436\u0438\u043d\u043e\u044f\u0442",
14
+ "2": "\u0441\u0438\u0451\u0441\u0430\u0442",
15
+ "3": "\u043c\u0430\u0434\u0430\u043d\u0438\u044f\u0442",
16
+ "4": "\u0444\u0430\u043d \u0432\u0430 \u0442\u0435\u0445\u043d\u0438\u043a\u0430",
17
+ "5": "\u0434\u0443\u043d\u0451",
18
+ "6": "\u0441\u043f\u043e\u0440\u0442",
19
+ "7": "\u0436\u0430\u043c\u0438\u044f\u0442",
20
+ "8": "\u0438\u049b\u0442\u0438\u0441\u043e\u0434\u0438\u0451\u0442",
21
+ "9": "\u0440\u0435\u043a\u043b\u0430\u043c\u0430",
22
+ "10": "\u0448\u043e\u0443-\u0431\u0438\u0437\u043d\u0435\u0441"
23
+ },
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 3072,
26
+ "label2id": {
27
+ "\u0434\u0443\u043d\u0451": 5,
28
+ "\u0436\u0430\u043c\u0438\u044f\u0442": 7,
29
+ "\u0436\u0438\u043d\u043e\u044f\u0442": 1,
30
+ "\u0438\u049b\u0442\u0438\u0441\u043e\u0434\u0438\u0451\u0442": 8,
31
+ "\u043c\u0430\u0434\u0430\u043d\u0438\u044f\u0442": 3,
32
+ "\u0440\u0435\u043a\u043b\u0430\u043c\u0430": 9,
33
+ "\u0441\u0430\u043b\u043e\u043c\u0430\u0442\u043b\u0438\u043a": 0,
34
+ "\u0441\u0438\u0451\u0441\u0430\u0442": 2,
35
+ "\u0441\u043f\u043e\u0440\u0442": 6,
36
+ "\u0444\u0430\u043d \u0432\u0430 \u0442\u0435\u0445\u043d\u0438\u043a\u0430": 4,
37
+ "\u0448\u043e\u0443-\u0431\u0438\u0437\u043d\u0435\u0441": 10
38
+ },
39
+ "layer_norm_eps": 1e-12,
40
+ "max_position_embeddings": 512,
41
+ "model_type": "bert",
42
+ "num_attention_heads": 12,
43
+ "num_hidden_layers": 12,
44
+ "pad_token_id": 0,
45
+ "position_embedding_type": "absolute",
46
+ "problem_type": "single_label_classification",
47
+ "torch_dtype": "float32",
48
+ "transformers_version": "4.9.2",
49
+ "type_vocab_size": 2,
50
+ "use_cache": true,
51
+ "vocab_size": 30000
52
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eadda71568f57657e656dadf9db0429a7dcb75232d7225a019282c3438223c81
3
+ size 436440493
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": null, "name_or_path": "coppercitylabs/uzbert-base-uncased", "tokenizer_class": "BertTokenizer"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff