marefa-nlp
/

marefa-ner

@@ -8,9 +8,9 @@ datasets:
 ## Marefa Arabic Named Entity Recognition Model
 ## نموذج المعرفة لتصنيف أجزاء النص
 ---------
-**Version**: 1.0.1
-**Last Update:** 16-05-2021
 ## Model description
@@ -27,70 +27,93 @@ Person, Location, Organization, Nationality, Job, Product, Event, Time, Art-Work
 ## How to use كيف تستخدم النموذج
-Install transformers AND nltk (python >= 3.6)
-`$ pip3 install transformers==4.6.0 nltk==3.5 protobuf==3.15.3 torch==1.7.1`
 > If you are using `Google Colab`, please restart your runtime after installing the packages.
 -----------
 ```python
-# we need to install NLTK punkt to be used for word tokenization
-# we need to install NLTK punkt to be used for word tokenization
-from collections import defaultdict
 import nltk
 nltk.download('punkt')
 from nltk.tokenize import word_tokenize
-from transformers import AutoTokenizer, AutoModelForTokenClassification
-from transformers import pipeline
-# labels list
-labels_list = ['O', 'B-nationality', 'B-event', 'B-person', 'B-artwork', 'B-location', 'B-product', 'B-organization', 'B-job', 'B-time', 'I-nationality', 'I-event', 'I-person', 'I-artwork', 'I-location', 'I-product', 'I-organization', 'I-job', 'I-time']
-# ===== import the model
-m_name = "marefa-nlp/marefa-ner"
-tokenizer = AutoTokenizer.from_pretrained(m_name)
-model = AutoModelForTokenClassification.from_pretrained(m_name)
-# ===== build the NER pipeline
-nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
-# ===== extract the entities from a sample text
-example = 'خاضت القوات المصرية حرب السادس من أكتوبر ضد الجيش الصهيوني عام 1973'
-# clean the text
-example = " ".join(word_tokenize(example))
-# feed to the NER model to parse
-ner_results = nlp(example)
-# we prepared a simple code to generate full entities tokens
-modified_results = []
-for ent in ner_results:
-  if ent["entity_group"].lower().replace("label_","").isnumeric():
-      ent["entity_group"] = int(ent["entity_group"].lower().replace("label_",""))
-      ent["entity_group"] = labels_list[ent["entity_group"]]
-  if len(modified_results) > 0 and ent["start"] == modified_results[-1]["end"]:
-    modified_results[-1]["word"] += f"{ent['word']}".replace("▁"," ").strip()
-    modified_results[-1]["word"] = modified_results[-1]["word"].replace("▁"," ").strip()
-    modified_results[-1]["score"] = sum([modified_results[-1]["score"], ent["score"]])/2
-    modified_results[-1]["end"] = ent["end"]
-  else:
-    modified_results.append(ent)
-for res in modified_results:
-  print(res["word"], "==>", res["entity_group"])
-#####
-# القوات المصرية ==> organization
-# حرب السادس من أكتوبر ==> event
-# الجيش الصهيوني ==> organization
-# عام 1973 ==> time
-####
 ```
 ## Acknowledgment شكر و تقدير

 ## Marefa Arabic Named Entity Recognition Model
 ## نموذج المعرفة لتصنيف أجزاء النص
 ---------
+**Version:** 1.2
+**Last Update:** 22-05-2021
 ## Model description
 ## How to use كيف تستخدم النموذج
+Install the following Python packages:
+`$ pip3 install simpletransformers==0.61.5 nltk==3.5 protobuf==3.15.3 torch==1.7.1`
 > If you are using `Google Colab`, please restart your runtime after installing the packages.
 -----------
 ```python
+from simpletransformers.ner import NERModel, NERArgs
+import logging
+import re
 import nltk
 nltk.download('punkt')
 from nltk.tokenize import word_tokenize
+logging.basicConfig(level=logging.INFO)
+transformers_logger = logging.getLogger("transformers")
+transformers_logger.setLevel(logging.WARNING)
+# Load the Model
+custom_labels = ["O", "B-job", "I-job", "B-nationality", "B-person", "I-person", "B-location",
+                 "B-time", "I-time", "B-event", "I-event", "B-organization", "I-organization",
+                 "I-location", "I-nationality", "B-product", "I-product", "B-artwork", "I-artwork"]
+model_args = NERArgs()
+model_args.labels_list=custom_labels
+ner_model = NERModel(
+     "xlmroberta", "marefa-nlp/marefa-ner",
+     args=model_args,
+     use_cuda=True # set to False to use CPU
+)
+# Model Inference
+samples = [
+    "تلقى تعليمه في الكتاب ثم انضم الى الأزهر عام 1873م. تعلم على يد السيد جمال الدين الأفغاني والشيخ محمد عبده",
+    "بعد عودته إلى القاهرة، التحق نجيب الريحاني فرقة جورج أبيض، الذي كان قد ضمَّ - قُبيل ذلك - فرقته إلى فرقة سلامة حجازي . و منها ذاع صيته"
+]
+# Preprocess
+samples = [ " ".join(word_tokenize(sample.strip())) for sample in samples if sample.strip() != "" ]
+# Predict
+predictions, raw_outputs = ner_model.predict(samples)
+# Group the Predicted Entities
+entities = []
+for pred in predictions:
+  grouped_entities = []
+  for rec in pred:
+    token = list(rec.keys())[0]
+    label = rec[token]
+    if label == "O":
+      continue
+    if "B-" in label:
+      grouped_entities.append({"token": token, "label": label.replace("B-","")})
+    elif "I-" in label and len(grouped_entities) > 0:
+      grouped_entities[-1]["token"] += f" {token}"
+  entities.append(grouped_entities)
+# Print the model outputs
+for sample, results in zip(samples, entities):
+  print(sample)
+  for res in results:
+    print("\t", res["token"], "=>", res["label"])
+  print("==================")
+ ###
+# تلقى تعليمه في الكتاب ثم انضم الى الأزهر عام 1873م . تعلم على يد السيد جمال الدين الأفغاني والشيخ محمد عبده
+# 	 الأزهر => organization
+# 	 عام 1873م => time
+# 	 جمال الدين الأفغاني => person
+# 	 محمد عبده => person
+# ==================
+# بعد عودته إلى القاهرة، التحق نجيب الريحاني فرقة جورج أبيض، الذي كان قد ضمَّ - قُبيل ذلك - فرقته إلى فرقة سلامة حجازي . و منها ذاع صيته
+# 	 القاهرة، => location
+# 	 نجيب الريحاني => person
+# 	 فرقة جورج أبيض، => organization
+# 	 فرقة سلامة حجازي => organization
+# ==================
+ ###
 ```
 ## Acknowledgment شكر و تقدير