File size: 2,130 Bytes
2ae875f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Load model directly
import json

import torch
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForTokenClassification

#tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-ner")
#model = AutoModelForTokenClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-ner")
#print("Model loaded successfully")

# Function to extract person names from text via a token-classification NER model
def extract_arabic_names(json_data, model, tokenizer):
    """Return the set of person names found in the "Arabic Text" fields.

    Parameters
    ----------
    json_data : iterable of dict
        Records; only entries containing an "Arabic Text" key are scanned.
    model : token-classification model
        Must expose ``config.id2label`` mapping class indices to BIO tags
        ("B-person" / "I-person" / "O" / ...).
    tokenizer : tokenizer matching ``model``
        Must expose ``__call__``, ``convert_ids_to_tokens`` and
        ``all_special_tokens``.

    Returns
    -------
    set of str
        Unique person names; sub-tokens are joined with single spaces.
    """
    # NOTE(review): the CAMeL-Lab camelbert NER model card lists its person
    # tags as "B-PERS"/"I-PERS" — confirm these literals against
    # model.config.id2label before relying on the output.
    arabic_names = set()

    for entry in json_data:
        if "Arabic Text" not in entry:
            continue
        text = entry["Arabic Text"]
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():  # inference only: skip autograd bookkeeping
            outputs = model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)
        # BUGFIX: id2label is keyed by plain ints; indexing it with a 0-d
        # tensor (as the original did) is not a valid lookup.
        predicted_labels = [model.config.id2label[int(label_id)]
                            for label_id in predictions[0]]
        # BUGFIX: the original zipped tokenizer.tokenize(text) (no special
        # tokens) against predictions made on tokenizer(text, ...) (which
        # adds [CLS]/[SEP]), shifting every label by one token.  Recover the
        # exact tokens the model saw and skip the special ones instead.
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

        current_name = []
        for token, label in zip(tokens, predicted_labels):
            if token in tokenizer.all_special_tokens:
                continue
            if label == "B-person":
                # BUGFIX: flush any name already in progress before starting
                # a new one (the original overwrote it and lost the name).
                if current_name:
                    arabic_names.add(" ".join(current_name))
                current_name = [token]
            elif label == "I-person" and current_name:
                current_name.append(token)
            else:
                # "O" or any non-person tag ends the current name.
                # BUGFIX: the original never flushed on "O", so a finished
                # name followed by ordinary text was silently dropped.
                if current_name:
                    arabic_names.add(" ".join(current_name))
                current_name = []

        # Flush a name that runs to the end of the text.
        if current_name:
            arabic_names.add(" ".join(current_name))

    return arabic_names

# Instantiate the CAMeL-Lab Arabic NER model together with its tokenizer.
model_name = "CAMeL-Lab/bert-base-arabic-camelbert-msa-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Read the JSON records that will be scanned for person names.
basic = 'cache/output/basic_info_frame.json'
with open(basic, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Run NER over every record and collect the unique person names.
arabic_names = extract_arabic_names(json_data, model, tokenizer)

# Report the result, one name per line (or a notice when nothing was found).
if not arabic_names:
    print("No Arabic names found.")
else:
    print("Arabic names extracted:")
    for name in arabic_names:
        print("Name:", name)