marefa-nlp
/

marefa-ner

@@ -5,8 +5,12 @@ datasets:
 ---
 # Tebyan تبيـان
-# Marefa Arabic Named Entity Recognition Model
-# نموذج المعرفة لتصنيف أجزاء النص
 ## Model description
@@ -25,7 +29,7 @@ Person, Location, Organization, Nationality, Job, Product, Event, Time, Art-Work
 Install transformers AND nltk (python >= 3.6)
-`$ pip3 install transformers==4.3.0 nltk==3.5 protobuf==3.15.3 torch==1.7.1`
 > If you are using `Google Colab`, please restart your runtime after installing the packages.
@@ -33,6 +37,7 @@ Install transformers AND nltk (python >= 3.6)
 ```python
 # we need to install NLTK punkt to be used for word tokenization
 from collections import defaultdict
 import nltk
 nltk.download('punkt')
@@ -41,6 +46,9 @@ from nltk.tokenize import word_tokenize
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 from transformers import pipeline
 # ===== import the model
 m_name = "marefa-nlp/marefa-ner"
 tokenizer = AutoTokenizer.from_pretrained(m_name)
@@ -50,49 +58,39 @@ model = AutoModelForTokenClassification.from_pretrained(m_name)
 nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
 # ===== extract the entities from a sample text
-example = 'قاد عمر المختار القوات في ليبيا ضد الجيش الإيطالي'
 # clean the text
 example = " ".join(word_tokenize(example))
 # feed to the NER model to parse
 ner_results = nlp(example)
-# as the [grouped_entities] parameter does not perform well in Arabic,
-# we prepared a simple fixing code to generate full entities tokens
-grouped_ner_results = defaultdict(list)
-fixed_ner_results = []
 for ent in ner_results:
-  grouped_ner_results[ent['entity_group']].append(ent)
-for group, ents in grouped_ner_results.items():
-  if len(ents) == 1:
-    fixed_ner_results.append(ents[0])
-    continue
-  current_ent = {"word": ents[0]['word'], "start": ents[0]['start'], "end": ents[0]['end'], "entity_group": group, "score": ents[0]['score']}
-  for i in range(1, len(ents)):
-    if ents[i]['start'] == current_ent["end"]:
-      current_ent["word"] += ents[i]['word']
-      current_ent["end"] = ents[i]['end']
-      current_ent["score"] = max(ents[i]['score'], current_ent["score"])
-    else:
-      fixed_ner_results.append(current_ent)
-      current_ent = {"word": ents[i]['word'], "start": ents[i]['start'], "end": ents[i]['end'], "entity_group": group, "score": ents[i]['score']}
-  fixed_ner_results.append(current_ent)
-# sort entities
-fixed_ner_results = sorted(fixed_ner_results, key=lambda e: e['start'], reverse=False)
-# ===== print the ner_results
-for ent in fixed_ner_results:
-  print(ent["word"], '->' ,ent['entity_group'], " # score:", "%.2f" % ent['score'])
 #####
-# عمر المختار -> person  # score: 1.00
-# ليبيا -> location  # score: 0.99
-# الجيش الإيطالي -> organization  # score: 0.99
 ####
 ```
@@ -103,6 +101,7 @@ for ent in fixed_ner_results:
 - على سيد عبد الحفيظ - إشراف
 - نرمين محمد عطيه
 - احمد علي عبدربه
 - عمر بن عبد العزيز سليمان
 - محمد ابراهيم الجمال

 ---
 # Tebyan تبيـان
+## Marefa Arabic Named Entity Recognition Model
+## نموذج المعرفة لتصنيف أجزاء النص
+---------
+**Version:** 1.0.1
+**Last Update:** 16-05-2021
 ## Model description
 Install `transformers` and `nltk` (Python >= 3.6):
+`$ pip3 install transformers==4.6.0 nltk==3.5 protobuf==3.15.3 torch==1.7.1`
 > If you are using `Google Colab`, please restart your runtime after installing the packages.
 ```python
 # Usage example: run the NER pipeline on an Arabic sentence and merge the
 # token-level predictions into full entity spans.
 # NLTK's "punkt" data must be downloaded because word_tokenize is used below.
 from collections import defaultdict
 import nltk
 nltk.download('punkt')
 from nltk.tokenize import word_tokenize
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 from transformers import pipeline
 # labels list: a tag's position is the numeric id parsed from the pipeline's
 # "LABEL_<id>" output further down, so the order must not be changed
 labels_list = ['O', 'B-nationality', 'B-event', 'B-person', 'B-artwork', 'B-location', 'B-product', 'B-organization', 'B-job', 'B-time', 'I-nationality', 'I-event', 'I-person', 'I-artwork', 'I-location', 'I-product', 'I-organization', 'I-job', 'I-time']
 # ===== import the model
 m_name = "marefa-nlp/marefa-ner"
 tokenizer = AutoTokenizer.from_pretrained(m_name)
 model = AutoModelForTokenClassification.from_pretrained(m_name)
 nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
 # ===== extract the entities from a sample text
 example = 'خاضت القوات المصرية حرب السادس من أكتوبر ضد الجيش الصهيوني عام 1973'
 # clean the text
 example = " ".join(word_tokenize(example))
 # feed to the NER model to parse
 ner_results = nlp(example)
 # ===== merge every B-<type> token with the I-<type> tokens that follow it
 modified_results = []
 for ent in ner_results:
     # map "LABEL_<id>" to its tag from labels_list
     label_id = int(ent["entity_group"].lower().replace("label_", ""))
     tag = labels_list[label_id]
     if tag.startswith("B-"):
         # a B- tag opens a new entity; keep only the entity type as its group
         ent["entity_group"] = tag[2:]
         modified_results.append(ent)
     elif tag.startswith("I-") and modified_results:
         previous = modified_results[-1]
         # extend the last entity only when the continuation has the same type
         if tag[2:] == previous["entity_group"]:
             previous["word"] += f" {ent['word']}"
             # running pairwise average: later tokens weigh more than in a plain mean
             previous["score"] = (previous["score"] + ent["score"]) / 2
             previous["end"] = ent["end"]
     # "O" tokens and I- tokens without a matching open entity are skipped
 # ===== print the merged entities
 for res in modified_results:
     print(res["word"], "==>", res["entity_group"])
 #####
+# القوات المصرية ==> organization
+# حرب السادس من أكتوبر ==> event
+# الجيش الصهيوني ==> organization
+# عام 1973 ==> time
 ####
 ```
 - على سيد عبد الحفيظ - إشراف
 - نرمين محمد عطيه
+- صلاح خيرالله
 - احمد علي عبدربه
 - عمر بن عبد العزيز سليمان
 - محمد ابراهيم الجمال