bakrianoo committed on
Commit
8845411
1 Parent(s): 4949bcf

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +36 -37
README.md CHANGED
@@ -5,8 +5,12 @@ datasets:
5
  ---
6
 
7
  # Tebyan تبيـان
8
- # Marefa Arabic Named Entity Recognition Model
9
- # نموذج المعرفة لتصنيف أجزاء النص
 
 
 
 
10
 
11
  ## Model description
12
 
@@ -25,7 +29,7 @@ Person, Location, Organization, Nationality, Job, Product, Event, Time, Art-Work
25
 
26
  Install transformers AND nltk (python >= 3.6)
27
 
28
- `$ pip3 install transformers==4.3.0 nltk==3.5 protobuf==3.15.3 torch==1.7.1`
29
 
30
  > If you are using `Google Colab`, please restart your runtime after installing the packages.
31
 
@@ -33,6 +37,7 @@ Install transformers AND nltk (python >= 3.6)
33
 
34
  ```python
35
  # we need to install NLTK punkt to be used for word tokenization
 
36
  from collections import defaultdict
37
  import nltk
38
  nltk.download('punkt')
@@ -41,6 +46,9 @@ from nltk.tokenize import word_tokenize
41
  from transformers import AutoTokenizer, AutoModelForTokenClassification
42
  from transformers import pipeline
43
 
 
 
 
44
  # ===== import the model
45
  m_name = "marefa-nlp/marefa-ner"
46
  tokenizer = AutoTokenizer.from_pretrained(m_name)
@@ -50,49 +58,39 @@ model = AutoModelForTokenClassification.from_pretrained(m_name)
50
  nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
51
 
52
  # ===== extract the entities from a sample text
53
- example = 'قاد عمر المختار القوات في ليبيا ضد الجيش الإيطالي'
54
  # clean the text
55
  example = " ".join(word_tokenize(example))
56
  # feed to the NER model to parse
57
  ner_results = nlp(example)
58
 
59
- # as the [grouped_entities] parameter does not perform well in Arabic,
60
- # we prepared a simple fixing code to generate full entities tokens
61
 
62
- grouped_ner_results = defaultdict(list)
63
- fixed_ner_results = []
64
  for ent in ner_results:
65
- grouped_ner_results[ent['entity_group']].append(ent)
66
-
67
-
68
- for group, ents in grouped_ner_results.items():
69
- if len(ents) == 1:
70
- fixed_ner_results.append(ents[0])
71
- continue
72
-
73
- current_ent = {"word": ents[0]['word'], "start": ents[0]['start'], "end": ents[0]['end'], "entity_group": group, "score": ents[0]['score']}
74
- for i in range(1, len(ents)):
75
- if ents[i]['start'] == current_ent["end"]:
76
- current_ent["word"] += ents[i]['word']
77
- current_ent["end"] = ents[i]['end']
78
- current_ent["score"] = max(ents[i]['score'], current_ent["score"])
79
- else:
80
- fixed_ner_results.append(current_ent)
81
- current_ent = {"word": ents[i]['word'], "start": ents[i]['start'], "end": ents[i]['end'], "entity_group": group, "score": ents[i]['score']}
82
-
83
- fixed_ner_results.append(current_ent)
84
-
85
- # sort entities
86
- fixed_ner_results = sorted(fixed_ner_results, key=lambda e: e['start'], reverse=False)
87
-
88
- # ===== print the ner_results
89
- for ent in fixed_ner_results:
90
- print(ent["word"], '->' ,ent['entity_group'], " # score:", "%.2f" % ent['score'])
91
 
92
  #####
93
- # عمر المختار -> person # score: 1.00
94
- # ليبيا -> location # score: 0.99
95
- # الجيش الإيطالي -> organization # score: 0.99
 
96
  ####
97
 
98
  ```
@@ -103,6 +101,7 @@ for ent in fixed_ner_results:
103
 
104
  - على سيد عبد الحفيظ - إشراف
105
  - نرمين محمد عطيه
 
106
  - احمد علي عبدربه
107
  - عمر بن عبد العزيز سليمان
108
  - محمد ابراهيم الجمال
 
5
  ---
6
 
7
  # Tebyan تبيـان
8
+ ## Marefa Arabic Named Entity Recognition Model
9
+ ## نموذج المعرفة لتصنيف أجزاء النص
10
+ ---------
11
+ **Version:** 1.0.1
12
+
13
+ **Last Update:** 16-05-2021
14
 
15
  ## Model description
16
 
 
29
 
30
  Install transformers AND nltk (python >= 3.6)
31
 
32
+ `$ pip3 install transformers==4.6.0 nltk==3.5 protobuf==3.15.3 torch==1.7.1`
33
 
34
  > If you are using `Google Colab`, please restart your runtime after installing the packages.
35
 
 
37
 
38
  ```python
39
  # we need to install NLTK punkt to be used for word tokenization
40
+ # (`punkt` provides the tokenizer data that `word_tokenize` relies on)
41
  from collections import defaultdict
42
  import nltk
43
  nltk.download('punkt')
 
46
  from transformers import AutoTokenizer, AutoModelForTokenClassification
47
  from transformers import pipeline
48
 
49
+ # labels list
50
+ labels_list = ['O', 'B-nationality', 'B-event', 'B-person', 'B-artwork', 'B-location', 'B-product', 'B-organization', 'B-job', 'B-time', 'I-nationality', 'I-event', 'I-person', 'I-artwork', 'I-location', 'I-product', 'I-organization', 'I-job', 'I-time']
51
+
52
  # ===== import the model
53
  m_name = "marefa-nlp/marefa-ner"
54
  tokenizer = AutoTokenizer.from_pretrained(m_name)
 
58
  nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
59
 
60
  # ===== extract the entities from a sample text
61
+ example = 'خاضت القوات المصرية حرب السادس من أكتوبر ضد الجيش الصهيوني عام 1973'
62
  # clean the text
63
  example = " ".join(word_tokenize(example))
64
  # feed to the NER model to parse
65
  ner_results = nlp(example)
66
 
67
+ # the loop below merges the B-/I- tagged tokens into full entity spans
 
68
 
69
+ modified_results = []
 
70
  for ent in ner_results:
71
+ ent["entity_group"] = int(ent["entity_group"].lower().replace("label_",""))
72
+ ent["entity_group"] = labels_list[ent["entity_group"]]
73
+
74
+ if ent["entity_group"] != "O":
75
+ if "B-" in ent["entity_group"]:
76
+ ent["entity_group"] = ent["entity_group"].replace("B-","")
77
+ modified_results.append(ent)
78
+ elif "I-" in ent["entity_group"]:
79
+ ## check related entity-group
80
+ label = ent["entity_group"].replace("I-","")
81
+ if len(modified_results) > 0 and label == modified_results[-1]["entity_group"]:
82
+ modified_results[-1]["word"] += f" {ent['word']}"
83
+ modified_results[-1]["score"] = sum([modified_results[-1]["score"], ent["score"]])/2
84
+ modified_results[-1]["end"] = ent["end"]
85
+
86
+ for res in modified_results:
87
+ print(res["word"], "==>", res["entity_group"])
 
 
 
 
 
 
 
 
 
88
 
89
  #####
90
+ # القوات المصرية ==> organization
91
+ # حرب السادس من أكتوبر ==> event
92
+ # الجيش الصهيوني ==> organization
93
+ # عام 1973 ==> time
94
  ####
95
 
96
  ```
 
101
 
102
  - على سيد عبد الحفيظ - إشراف
103
  - نرمين محمد عطيه
104
+ - صلاح خيرالله
105
  - احمد علي عبدربه
106
  - عمر بن عبد العزيز سليمان
107
  - محمد ابراهيم الجمال