vovobobo committed
Commit
6d528b7
1 Parent(s): ad223ed

commit files to HF hub

.gitattributes CHANGED
@@ -1,34 +1,17 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
  *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
  *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
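Each line above maps a file glob to the Git LFS filter, so matching files are stored as LFS pointers rather than ordinary Git blobs. As an illustration only (not part of this commit), here is a minimal Python sketch that reads a local .gitattributes like the one above and checks whether a filename would be routed through LFS; it uses plain fnmatch globbing, which simplifies the full gitattributes pattern rules:

```python
from fnmatch import fnmatch
from pathlib import Path

def lfs_patterns(path=".gitattributes"):
    """Return the glob patterns that .gitattributes routes through the LFS filter."""
    patterns = []
    for line in Path(path).read_text().splitlines():
        parts = line.split()
        if len(parts) >= 2 and "filter=lfs" in parts[1:]:
            patterns.append(parts[0])
    return patterns

def is_lfs_tracked(filename, patterns):
    """Simplified check: does the filename match any LFS glob?"""
    return any(fnmatch(filename, pattern) for pattern in patterns)

patterns = lfs_patterns()
print(is_lfs_tracked("pytorch_model.bin", patterns))  # expected: True under the new rules
print(is_lfs_tracked("vocab.txt", patterns))          # expected: False (plain text file)
```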
README.md ADDED
@@ -0,0 +1,59 @@
+ ---
+ language: zh
+ tags:
+ - wobert
+ inference: False
+ ---
+ ## Introduction
+ ### TensorFlow version
+ https://github.com/ZhuiyiTechnology/WoBERT
+ ### PyTorch version
+ https://github.com/JunnYu/WoBERT_pytorch
+
+ ## Installation (mainly to install WoBertTokenizer)
+ ```bash
+ pip install git+https://github.com/JunnYu/WoBERT_pytorch.git
+ ```
+
+ ## Usage
+ ```python
+ import torch
+ from transformers import BertForMaskedLM as WoBertForMaskedLM
+ from wobert import WoBertTokenizer
+
+ pretrained_model_or_path_list = [
+     "junnyu/wobert_chinese_plus_base", "junnyu/wobert_chinese_base"
+ ]
+ for path in pretrained_model_or_path_list:
+     text = "今天[MASK]很好,我[MASK]去公园玩。"
+     tokenizer = WoBertTokenizer.from_pretrained(path)
+     model = WoBertForMaskedLM.from_pretrained(path)
+     inputs = tokenizer(text, return_tensors="pt")
+     with torch.no_grad():
+         outputs = model(**inputs).logits[0]
+     outputs_sentence = ""
+     for i, id in enumerate(tokenizer.encode(text)):
+         if id == tokenizer.mask_token_id:
+             tokens = tokenizer.convert_ids_to_tokens(outputs[i].topk(k=5)[1])
+             outputs_sentence += "[" + "||".join(tokens) + "]"
+         else:
+             outputs_sentence += "".join(
+                 tokenizer.convert_ids_to_tokens([id],
+                                                 skip_special_tokens=True))
+     print(outputs_sentence)
+     # RoFormer 今天[天气||天||心情||阳光||空气]很好,我[想||要||打算||准备||喜欢]去公园玩。
+     # PLUS WoBERT 今天[天气||阳光||天||心情||空气]很好,我[想||要||打算||准备||就]去公园玩。
+     # WoBERT 今天[天气||阳光||天||心情||空气]很好,我[想||要||就||准备||也]去公园玩。
+ ```
+ ## Citation
+
+ Bibtex:
+
+ ```tex
+ @techreport{zhuiyiwobert,
+   title={WoBERT: Word-based Chinese BERT model - ZhuiyiAI},
+   author={Jianlin Su},
+   year={2020},
+   url="https://github.com/ZhuiyiTechnology/WoBERT",
+ }
+ ```
config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "architectures": [
+     "BertForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "directionality": "bidi",
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.5.1",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 50000
+ }
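The configuration above describes a standard 12-layer BERT encoder; the WoBERT-specific detail is mainly the 50,000-entry word-level vocabulary. As a minimal sketch (assuming a local copy of config.json; the file path is illustrative and not part of this commit), it can be read back and instantiated with transformers:

```python
from transformers import BertConfig, BertForMaskedLM

# Read the architecture definition shown above from a local config.json.
config = BertConfig.from_json_file("config.json")
print(config.vocab_size)                             # 50000 word-level entries
print(config.num_hidden_layers, config.hidden_size)  # 12 layers, 768 hidden units

# Build an untrained model with this architecture; to obtain the trained weights,
# use BertForMaskedLM.from_pretrained(<repo id or local directory>) instead.
model = BertForMaskedLM(config)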
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca6f7ec47fa15b29244d234f4d5c360a16016f7f3bfa39dad946f375af1677a3
+ size 497978787
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bebcf0185342ef391ac44689481c69296509821bacf46421ff22fcc31cdefc7b
+ size 500428391
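Both weight files are committed as Git LFS pointer files: the three lines record the LFS spec version, the SHA-256 digest (oid) of the real payload, and its size in bytes, while the ~500 MB binaries themselves live in LFS storage. As an illustrative sketch (the file name is assumed, and it requires the real weights to have been fetched, e.g. via `git lfs pull`), a downloaded file can be checked against the oid from its pointer:

```python
import hashlib

def sha256_of(path, chunk_size=1 << 20):
    """Stream a file from disk and return its hex SHA-256 digest."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# oid taken from the pytorch_model.bin pointer above.
expected = "bebcf0185342ef391ac44689481c69296509821bacf46421ff22fcc31cdefc7b"
print(sha256_of("pytorch_model.bin") == expected)  # True if the download is intact
```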
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "unk_token": "[UNK]",
+   "sep_token": "[SEP]",
+   "pad_token": "[PAD]",
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]"
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "do_lower_case": true,
+   "do_basic_tokenize": true,
+   "never_split": null,
+   "unk_token": "[UNK]",
+   "sep_token": "[SEP]",
+   "pad_token": "[PAD]",
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "tokenize_chinese_chars": true,
+   "strip_accents": null,
+   "special_tokens_map_file": null,
+   "tokenizer_file": null,
+   "name_or_path": "junnyu/wobert_chinese_plus_base",
+   "tokenizer_class": "RoFormerTokenizer"
+ }
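Note that `tokenizer_class` is set to `RoFormerTokenizer`, so AutoTokenizer resolves to that class rather than to the WoBertTokenizer used in the README. A minimal sketch, assuming the repository has been downloaded to a local directory named ./wobert_chinese_plus_base and that rjieba (which the RoFormer tokenizer typically relies on for Chinese word segmentation) is installed:

```python
from transformers import AutoTokenizer

# AutoTokenizer reads "tokenizer_class" from tokenizer_config.json and picks
# the RoFormer tokenizer for this repo; the local path below is an assumption.
tokenizer = AutoTokenizer.from_pretrained("./wobert_chinese_plus_base")
print(type(tokenizer).__name__)           # RoFormerTokenizer (or its fast variant)
print(tokenizer.tokenize("今天天气很好"))  # word-level pieces rather than single characters
```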
vocab.txt ADDED
The diff for this file is too large to render. See raw diff