shibing624
committed on
Commit
•
e3792e1
1
Parent(s):
1dd5da2
Update README.md
Browse files
README.md
CHANGED
@@ -48,6 +48,66 @@ bert4ner-base-chinese
|
|
48 |
└── vocab.txt
|
49 |
```
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
### 训练数据集
|
52 |
#### 中文实体识别数据集
|
53 |
|
|
|
48 |
└── vocab.txt
|
49 |
```
|
50 |
|
51 |
+
## Usage (HuggingFace Transformers)
|
52 |
+
Without [nerpy](https://github.com/shibing624/nerpy), you can use the model like this:
|
53 |
+
|
54 |
+
First, pass your input through the transformer model; then decode the predicted BIO tags to extract the entity words.
|
55 |
+
|
56 |
+
Install the required packages:
|
57 |
+
```
|
58 |
+
pip install transformers seqeval
|
59 |
+
```
|
60 |
+
|
61 |
+
```python
|
62 |
+
import os
|
63 |
+
import torch
|
64 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
65 |
+
from seqeval.metrics.sequence_labeling import get_entities
|
66 |
+
|
67 |
+
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
68 |
+
|
69 |
+
# Load model from HuggingFace Hub
|
70 |
+
tokenizer = AutoTokenizer.from_pretrained("shibing624/bert4ner-base-chinese")
|
71 |
+
model = AutoModelForTokenClassification.from_pretrained("shibing624/bert4ner-base-chinese")
|
72 |
+
label_list = ['I-ORG', 'B-LOC', 'O', 'B-ORG', 'I-LOC', 'I-PER', 'B-TIME', 'I-TIME', 'B-PER']
|
73 |
+
|
74 |
+
sentence = "王宏伟来自北京,是个警察,喜欢去王府井游玩儿。"
|
75 |
+
|
76 |
+
|
77 |
+
def get_entity(sentence):
|
78 |
+
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sentence)))
|
79 |
+
inputs = tokenizer.encode(sentence, return_tensors="pt")
|
80 |
+
with torch.no_grad():
|
81 |
+
outputs = model(inputs).logits
|
82 |
+
predictions = torch.argmax(outputs, dim=2)
|
83 |
+
char_tags = [(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())][1:-1]
|
84 |
+
print(sentence)
|
85 |
+
print(char_tags)
|
86 |
+
|
87 |
+
pred_labels = [i[1] for i in char_tags]
|
88 |
+
entities = []
|
89 |
+
line_entities = get_entities(pred_labels)
|
90 |
+
for i in line_entities:
|
91 |
+
word = sentence[i[1]: i[2] + 1]
|
92 |
+
entity_type = i[0]
|
93 |
+
entities.append((word, entity_type))
|
94 |
+
|
95 |
+
print("Sentence entity:")
|
96 |
+
print(entities)
|
97 |
+
|
98 |
+
|
99 |
+
get_entity(sentence)
|
100 |
+
```
|
101 |
+
|
102 |
+
Output:
|
103 |
+
```shell
|
104 |
+
王宏伟来自北京,是个警察,喜欢去王府井游玩儿。
|
105 |
+
[('王', 'B-PER'), ('宏', 'I-PER'), ('伟', 'I-PER'), ('来', 'O'), ('自', 'O'), ('北', 'B-LOC'), ('京', 'I-LOC'), (',', 'O'), ('是', 'O'), ('个', 'O'), ('警', 'O'), ('察', 'O'), (',', 'O'), ('喜', 'O'), ('欢', 'O'), ('去', 'O'), ('王', 'B-LOC'), ('府', 'I-LOC'), ('井', 'I-LOC'), ('游', 'O'), ('玩', 'O'), ('儿', 'O'), ('。', 'O')]
|
106 |
+
Sentence entity:
|
107 |
+
[('王宏伟', 'PER'), ('北京', 'LOC'), ('王府井', 'LOC')]
|
108 |
+
```
|
109 |
+
|
110 |
+
|
111 |
### 训练数据集
|
112 |
#### 中文实体识别数据集
|
113 |
|