Update README.md
Browse files
README.md
CHANGED
@@ -5,8 +5,12 @@ datasets:
|
|
5 |
---
|
6 |
|
7 |
# Tebyan تبيـان
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
10 |
|
11 |
## Model description
|
12 |
|
@@ -25,7 +29,7 @@ Person, Location, Organization, Nationality, Job, Product, Event, Time, Art-Work
|
|
25 |
|
26 |
Install the `transformers` and `nltk` packages (Python >= 3.6)
|
27 |
|
28 |
-
`$ pip3 install transformers==4.
|
29 |
|
30 |
> If you are using `Google Colab`, please restart your runtime after installing the packages.
|
31 |
|
@@ -33,6 +37,7 @@ Install transformers AND nltk (python >= 3.6)
|
|
33 |
|
34 |
```python
|
35 |
# we need to install NLTK punkt to be used for word tokenization
|
|
|
36 |
from collections import defaultdict
|
37 |
import nltk
|
38 |
nltk.download('punkt')
|
@@ -41,6 +46,9 @@ from nltk.tokenize import word_tokenize
|
|
41 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
42 |
from transformers import pipeline
|
43 |
|
|
|
|
|
|
|
44 |
# ===== import the model
|
45 |
m_name = "marefa-nlp/marefa-ner"
|
46 |
tokenizer = AutoTokenizer.from_pretrained(m_name)
|
@@ -50,49 +58,39 @@ model = AutoModelForTokenClassification.from_pretrained(m_name)
|
|
50 |
nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
|
51 |
|
52 |
# ===== extract the entities from a sample text
|
53 |
-
example = '
|
54 |
# clean the text
|
55 |
example = " ".join(word_tokenize(example))
|
56 |
# feed to the NER model to parse
|
57 |
ner_results = nlp(example)
|
58 |
|
59 |
-
#
|
60 |
-
# we prepared a simple fix-up snippet to reassemble the full entity tokens
|
61 |
|
62 |
-
|
63 |
-
fixed_ner_results = []
|
64 |
for ent in ner_results:
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
fixed_ner_results.append(current_ent)
|
84 |
-
|
85 |
-
# sort entities
|
86 |
-
fixed_ner_results = sorted(fixed_ner_results, key=lambda e: e['start'], reverse=False)
|
87 |
-
|
88 |
-
# ===== print the ner_results
|
89 |
-
for ent in fixed_ner_results:
|
90 |
-
print(ent["word"], '->' ,ent['entity_group'], " # score:", "%.2f" % ent['score'])
|
91 |
|
92 |
#####
|
93 |
-
#
|
94 |
-
#
|
95 |
-
# الجيش
|
|
|
96 |
####
|
97 |
|
98 |
```
|
@@ -103,6 +101,7 @@ for ent in fixed_ner_results:
|
|
103 |
|
104 |
- على سيد عبد الحفيظ - إشراف
|
105 |
- نرمين محمد عطيه
|
|
|
106 |
- احمد علي عبدربه
|
107 |
- عمر بن عبد العزيز سليمان
|
108 |
- محمد ابراهيم الجمال
|
|
|
5 |
---
|
6 |
|
7 |
# Tebyan تبيـان
|
8 |
+
## Marefa Arabic Named Entity Recognition Model
|
9 |
+
## نموذج المعرفة لتصنيف أجزاء النص
|
10 |
+
---------
|
11 |
+
**Version**: 1.0.1
|
12 |
+
|
13 |
+
**Last Update:** 16-05-2021
|
14 |
|
15 |
## Model description
|
16 |
|
|
|
29 |
|
30 |
Install the `transformers` and `nltk` packages (Python >= 3.6)
|
31 |
|
32 |
+
`$ pip3 install transformers==4.6.0 nltk==3.5 protobuf==3.15.3 torch==1.7.1`
|
33 |
|
34 |
> If you are using `Google Colab`, please restart your runtime after installing the packages.
|
35 |
|
|
|
37 |
|
38 |
```python
|
39 |
# we need to install NLTK punkt to be used for word tokenization
|
40 |
+
# import the required modules
|
41 |
from collections import defaultdict
|
42 |
import nltk
|
43 |
nltk.download('punkt')
|
|
|
46 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
47 |
from transformers import pipeline
|
48 |
|
49 |
+
# labels list
|
50 |
+
labels_list = ['O', 'B-nationality', 'B-event', 'B-person', 'B-artwork', 'B-location', 'B-product', 'B-organization', 'B-job', 'B-time', 'I-nationality', 'I-event', 'I-person', 'I-artwork', 'I-location', 'I-product', 'I-organization', 'I-job', 'I-time']
|
51 |
+
|
52 |
# ===== import the model
|
53 |
m_name = "marefa-nlp/marefa-ner"
|
54 |
tokenizer = AutoTokenizer.from_pretrained(m_name)
|
|
|
58 |
nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
|
59 |
|
60 |
# ===== extract the entities from a sample text
|
61 |
+
example = 'خاضت القوات المصرية حرب السادس من أكتوبر ضد الجيش الصهيوني عام 1973'
|
62 |
# clean the text
|
63 |
example = " ".join(word_tokenize(example))
|
64 |
# feed to the NER model to parse
|
65 |
ner_results = nlp(example)
|
66 |
|
67 |
+
# we prepared a simple snippet to reassemble the full entity tokens
|
|
|
68 |
|
69 |
+
modified_results = []
|
|
|
70 |
for ent in ner_results:
|
71 |
+
ent["entity_group"] = int(ent["entity_group"].lower().replace("label_",""))
|
72 |
+
ent["entity_group"] = labels_list[ent["entity_group"]]
|
73 |
+
|
74 |
+
if ent["entity_group"] != "O":
|
75 |
+
if "B-" in ent["entity_group"]:
|
76 |
+
ent["entity_group"] = ent["entity_group"].replace("B-","")
|
77 |
+
modified_results.append(ent)
|
78 |
+
elif "I-" in ent["entity_group"]:
|
79 |
+
## check whether this token continues the previous entity group
|
80 |
+
label = ent["entity_group"].replace("I-","")
|
81 |
+
if len(modified_results) > 0 and label == modified_results[-1]["entity_group"]:
|
82 |
+
modified_results[-1]["word"] += f" {ent['word']}"
|
83 |
+
modified_results[-1]["score"] = sum([modified_results[-1]["score"], ent["score"]])/2
|
84 |
+
modified_results[-1]["end"] = ent["end"]
|
85 |
+
|
86 |
+
for res in modified_results:
|
87 |
+
print(res["word"], "==>", res["entity_group"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
89 |
#####
|
90 |
+
# القوات المصرية ==> organization
|
91 |
+
# حرب السادس من أكتوبر ==> event
|
92 |
+
# الجيش الصهيوني ==> organization
|
93 |
+
# عام 1973 ==> time
|
94 |
####
|
95 |
|
96 |
```
|
|
|
101 |
|
102 |
- على سيد عبد الحفيظ - إشراف
|
103 |
- نرمين محمد عطيه
|
104 |
+
- صلاح خيرالله
|
105 |
- احمد علي عبدربه
|
106 |
- عمر بن عبد العزيز سليمان
|
107 |
- محمد ابراهيم الجمال
|