Spaces:

Ahmedhisham
/

project_DF

No application file

App Files Files Community

project_DF / ner.py

Ahmedhisham

Upload 6 files

2ae875f verified 9 months ago

raw

history blame contribute delete

1.89 kB

	import json
	from transformers import AutoTokenizer, AutoModelForTokenClassification
	import nltk
	nltk.download('punkt')
	from nltk.tokenize import word_tokenize

	# Function to extract names using MAREFA NER model
	def extract_arabic_names(json_data, model, tokenizer):
	    """Collect the distinct person names a token-classification model finds.

	    Each entry of ``json_data`` that has an ``"Arabic Text"`` key is encoded
	    with ``tokenizer``, run through ``model``, and scanned for runs of tokens
	    labelled ``B-person`` / ``I-person`` (label strings are looked up in
	    ``model.config.id2label``).

	    Args:
	        json_data: Iterable of mappings; entries without an
	            ``"Arabic Text"`` key are skipped.
	        model: Callable invoked as ``model(**inputs)``; its result must
	            expose ``logits`` and the model must expose ``config.id2label``
	            keyed by integer label ids.
	        tokenizer: Callable tokenizer whose encoding contains
	            ``"input_ids"``; must also provide ``convert_ids_to_tokens``,
	            ``convert_tokens_to_string`` and ``all_special_tokens``.

	    Returns:
	        A ``set`` of name strings (empty when nothing was found).
	    """
	    arabic_names = set()
	    special_tokens = set(tokenizer.all_special_tokens)

	    def _commit(pieces):
	        # Let the tokenizer glue sub-word pieces back together so the name
	        # contains neither piece markers nor spaces inside a word.
	        if pieces:
	            name = tokenizer.convert_tokens_to_string(pieces).strip()
	            if name:
	                arabic_names.add(name)

	    for entry in json_data:
	        if "Arabic Text" not in entry:
	            continue

	        inputs = tokenizer(entry["Arabic Text"], return_tensors="pt")
	        outputs = model(**inputs)

	        # Plain ints are required here: tensor elements hash by identity and
	        # would never match the integer keys of id2label.
	        label_ids = outputs.logits.argmax(dim=-1)[0].tolist()

	        # Derive the tokens from the very ids the model saw, so that tokens
	        # and predictions stay aligned one-to-one (special tokens included).
	        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

	        pieces = []
	        for token, label_id in zip(tokens, label_ids):
	            label = model.config.id2label[label_id]
	            if token in special_tokens:
	                # Sequence markers are never part of a name, whatever the
	                # model predicted for them.
	                label = "O"

	            if label == "B-person":
	                # A new name starts: store the one in progress first.
	                _commit(pieces)
	                pieces = [token]
	            elif label == "I-person":
	                # Continuation; a stray I-person simply opens a new name.
	                pieces.append(token)
	            else:
	                # Any other label (including "O") ends the current name.
	                _commit(pieces)
	                pieces = []

	        _commit(pieces)

	    return arabic_names

	# Checkpoint identifier handed to both from_pretrained() loaders below.
	model_name = "marefa-nlp/marefa-ner"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForTokenClassification.from_pretrained(model_name)

	# Path of the JSON file whose entries are scanned for names.
	basic = "cache/output/basic_info_frame.json"
	with open(basic, "r", encoding="utf-8") as file:
	    json_data = json.load(file)

	# Run the extraction over every entry that was read from the file.
	arabic_names = extract_arabic_names(json_data, model, tokenizer)

	# Report the outcome: one line per name, or a notice when none were found.
	if not arabic_names:
	    print("No Arabic names found.")
	else:
	    print("Arabic names extracted:")
	    for name in arabic_names:
	        print("Name:", name)