bakrianoo committed on
Commit
98ce57c
1 Parent(s): 7b25d84

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +77 -54
README.md CHANGED
@@ -8,9 +8,9 @@ datasets:
8
  ## Marefa Arabic Named Entity Recognition Model
9
  ## نموذج المعرفة لتصنيف أجزاء النص
10
  ---------
11
- **Version**: 1.0.1
12
 
13
- **Last Update:** 16-05-2021
14
 
15
  ## Model description
16
 
@@ -27,70 +27,93 @@ Person, Location, Organization, Nationality, Job, Product, Event, Time, Art-Work
27
 
28
  ## How to use كيف تستخدم النموذج
29
 
30
- Install transformers AND nltk (python >= 3.6)
31
 
32
- `$ pip3 install transformers==4.6.0 nltk==3.5 protobuf==3.15.3 torch==1.7.1`
33
 
34
  > If you are using `Google Colab`, please restart your runtime after installing the packages.
35
 
36
  -----------
37
 
38
  ```python
39
- # we need to install NLTK punkt to be used for word tokenization
40
- # we need to install NLTK punkt to be used for word tokenization
41
- from collections import defaultdict
 
42
  import nltk
43
  nltk.download('punkt')
44
  from nltk.tokenize import word_tokenize
45
 
46
- from transformers import AutoTokenizer, AutoModelForTokenClassification
47
- from transformers import pipeline
48
-
49
- # labels list
50
- labels_list = ['O', 'B-nationality', 'B-event', 'B-person', 'B-artwork', 'B-location', 'B-product', 'B-organization', 'B-job', 'B-time', 'I-nationality', 'I-event', 'I-person', 'I-artwork', 'I-location', 'I-product', 'I-organization', 'I-job', 'I-time']
51
-
52
- # ===== import the model
53
- m_name = "marefa-nlp/marefa-ner"
54
- tokenizer = AutoTokenizer.from_pretrained(m_name)
55
- model = AutoModelForTokenClassification.from_pretrained(m_name)
56
-
57
- # ===== build the NER pipeline
58
- nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
59
-
60
- # ===== extract the entities from a sample text
61
- example = 'خاضت القوات المصرية حرب السادس من أكتوبر ضد الجيش الصهيوني عام 1973'
62
- # clean the text
63
- example = " ".join(word_tokenize(example))
64
- # feed to the NER model to parse
65
- ner_results = nlp(example)
66
-
67
- # we prepared a simple code to generate full entities tokens
68
-
69
- modified_results = []
70
- for ent in ner_results:
71
- if ent["entity_group"].lower().replace("label_","").isnumeric():
72
- ent["entity_group"] = int(ent["entity_group"].lower().replace("label_",""))
73
- ent["entity_group"] = labels_list[ent["entity_group"]]
74
-
75
- if len(modified_results) > 0 and ent["start"] == modified_results[-1]["end"]:
76
- modified_results[-1]["word"] += f"{ent['word']}".replace("▁"," ").strip()
77
- modified_results[-1]["word"] = modified_results[-1]["word"].replace("▁"," ").strip()
78
- modified_results[-1]["score"] = sum([modified_results[-1]["score"], ent["score"]])/2
79
- modified_results[-1]["end"] = ent["end"]
80
- else:
81
- modified_results.append(ent)
82
-
83
-
84
- for res in modified_results:
85
- print(res["word"], "==>", res["entity_group"])
 
 
 
 
 
 
86
 
87
- #####
88
- # القوات المصرية ==> organization
89
- # حرب السادس من أكتوبر ==> event
90
- # الجيش الصهيوني ==> organization
91
- # عام 1973 ==> time
92
- ####
93
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  ```
95
 
96
  ## Acknowledgment شكر و تقدير
 
8
  ## Marefa Arabic Named Entity Recognition Model
9
  ## نموذج المعرفة لتصنيف أجزاء النص
10
  ---------
11
+ **Version**: 1.2
12
 
13
+ **Last Update:** 22-05-2021
14
 
15
  ## Model description
16
 
 
27
 
28
  ## How to use كيف تستخدم النموذج
29
 
30
+ Install the following Python packages:
31
 
32
+ `$ pip3 install simpletransformers==0.61.5 nltk==3.5 protobuf==3.15.3 torch==1.7.1`
33
 
34
  > If you are using `Google Colab`, please restart your runtime after installing the packages.
35
 
36
  -----------
37
 
38
  ```python
39
+ from simpletransformers.ner import NERModel, NERArgs
40
+ import logging
41
+ import re
42
+
43
  import nltk
44
  nltk.download('punkt')
45
  from nltk.tokenize import word_tokenize
46
 
47
+ logging.basicConfig(level=logging.INFO)
48
+ transformers_logger = logging.getLogger("transformers")
49
+ transformers_logger.setLevel(logging.WARNING)
50
+
51
+ # Load the Model
52
+ custom_labels = ["O", "B-job", "I-job", "B-nationality", "B-person", "I-person", "B-location",
53
+ "B-time", "I-time", "B-event", "I-event", "B-organization", "I-organization",
54
+ "I-location", "I-nationality", "B-product", "I-product", "B-artwork", "I-artwork"]
55
+
56
+ model_args = NERArgs()
57
+ model_args.labels_list=custom_labels
58
+
59
+ ner_model = NERModel(
60
+ "xlmroberta", "marefa-nlp/marefa-ner",
61
+ args=model_args,
62
+ use_cuda=True # set to False to use CPU
63
+ )
64
+
65
+ # Model Inference
66
+ samples = [
67
+ "تلقى تعليمه في الكتاب ثم انضم الى الأزهر عام 1873م. تعلم على يد السيد جمال الدين الأفغاني والشيخ محمد عبده",
68
+ "بعد عودته إلى القاهرة، التحق نجيب الريحاني فرقة جورج أبيض، الذي كان قد ضمَّ - قُبيل ذلك - فرقته إلى فرقة سلامة حجازي . و منها ذاع صيته"
69
+ ]
70
+
71
+ # Preprocess
72
+ samples = [ " ".join(word_tokenize(sample.strip())) for sample in samples if sample.strip() != "" ]
73
+
74
+ # Predict
75
+ predictions, raw_outputs = ner_model.predict(samples)
76
+
77
+ # Group the Predicted Entities
78
+ entities = []
79
+ for pred in predictions:
80
+ grouped_entities = []
81
+ for rec in pred:
82
+
83
+ token = list(rec.keys())[0]
84
+ label = rec[token]
85
+
86
+ if label == "O":
87
+ continue
88
+
89
+ if "B-" in label:
90
+ grouped_entities.append({"token": token, "label": label.replace("B-","")})
91
+ elif "I-" in label and len(grouped_entities) > 0:
92
+ grouped_entities[-1]["token"] += f" {token}"
93
 
94
+ entities.append(grouped_entities)
95
+
96
+ # Print the model outputs
97
+ for sample, results in zip(samples, entities):
98
+ print(sample)
99
+ for res in results:
100
+ print("\t", res["token"], "=>", res["label"])
101
+ print("==================")
102
+
103
+ ###
104
+ # تلقى تعليمه في الكتاب ثم انضم الى الأزهر عام 1873م . تعلم على يد السيد جمال الدين الأفغاني والشيخ محمد عبده
105
+ # الأزهر => organization
106
+ # عام 1873م => time
107
+ # جمال الدين الأفغاني => person
108
+ # محمد عبده => person
109
+ # ==================
110
+ # بعد عودته إلى القاهرة، التحق نجيب الريحاني فرقة جورج أبيض، الذي كان قد ضمَّ - قُبيل ذلك - فرقته إلى فرقة سلامة حجازي . و منها ذاع صيته
111
+ # القاهرة، => location
112
+ # نجيب الريحاني => person
113
+ # فرقة جورج أبيض، => organization
114
+ # فرقة سلامة حجازي => organization
115
+ # ==================
116
+ ###
117
  ```
118
 
119
  ## Acknowledgment شكر و تقدير