Ahmedhisham committed on
Commit
2ae875f
1 Parent(s): 19ab899

Upload 6 files

Files changed (6)
  1. ner.py +53 -0
  2. ner_camel_MSA.py +57 -0
  3. pdf_to_imgs.py +15 -0
  4. regex_extract.py +102 -0
  5. regex_format_after_OCR.py +25 -0
  6. stamp_detection.py +26 -0
ner.py ADDED
@@ -0,0 +1,53 @@
+ import json
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+
+ # Extract person names from the OCR output using the MAREFA NER model.
+ def extract_arabic_names(json_data, model, tokenizer):
+     arabic_names = set()
+
+     for entry in json_data:
+         # Key matches the output written by regex_format_after_OCR.py.
+         if "Arabic text" in entry:
+             text = entry["Arabic text"]
+             tokenized_text = tokenizer.tokenize(text)
+             inputs = tokenizer(text, return_tensors="pt", truncation=True)
+             outputs = model(**inputs)
+             predictions = outputs.logits.argmax(dim=-1)
+             # Drop the [CLS]/[SEP] positions so labels align with tokenized_text.
+             predicted_labels = [model.config.id2label[label_id.item()] for label_id in predictions[0][1:-1]]
+
+             current_name = ""
+             for token, label in zip(tokenized_text, predicted_labels):
+                 if label == "B-person":
+                     # Flush any name in progress before starting a new one.
+                     if current_name:
+                         arabic_names.add(current_name)
+                     current_name = token
+                 elif label == "I-person":
+                     current_name += " " + token
+                 elif current_name:
+                     arabic_names.add(current_name)
+                     current_name = ""
+
+             if current_name:
+                 arabic_names.add(current_name)
+
+     return arabic_names
+
+ # Load the MAREFA NER model and tokenizer
+ model_name = "marefa-nlp/marefa-ner"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForTokenClassification.from_pretrained(model_name)
+
+ # Load JSON data from the file
+ basic = 'cache/output/basic_info_frame.json'
+ with open(basic, "r", encoding="utf-8") as file:
+     json_data = json.load(file)
+
+ # Extract names from the JSON data using the MAREFA model
+ arabic_names = extract_arabic_names(json_data, model, tokenizer)
+
+ # Print the extracted names
+ if arabic_names:
+     print("Arabic names extracted:")
+     for name in arabic_names:
+         print("Name:", name)
+ else:
+     print("No Arabic names found.")
ner_camel_MSA.py ADDED
@@ -0,0 +1,57 @@
+ # Load model directly
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ import json
+
+ # Extract person names from the OCR output using the CAMeL-BERT MSA NER model.
+ def extract_arabic_names(json_data, model, tokenizer):
+     arabic_names = set()
+
+     for entry in json_data:
+         # Key matches the output written by regex_format_after_OCR.py.
+         if "Arabic text" in entry:
+             text = entry["Arabic text"]
+             tokenized_text = tokenizer.tokenize(text)
+             inputs = tokenizer(text, return_tensors="pt", truncation=True)
+             outputs = model(**inputs)
+             predictions = outputs.logits.argmax(dim=-1)
+             # Drop the [CLS]/[SEP] positions so labels align with tokenized_text.
+             predicted_labels = [model.config.id2label[label_id.item()] for label_id in predictions[0][1:-1]]
+
+             current_name = ""
+             # CAMeLBERT's NER head uses the uppercase ANERcorp tags
+             # (B-PER/I-PER), not MAREFA's B-person/I-person.
+             for token, label in zip(tokenized_text, predicted_labels):
+                 if label == "B-PER":
+                     # Flush any name in progress before starting a new one.
+                     if current_name:
+                         arabic_names.add(current_name)
+                     current_name = token
+                 elif label == "I-PER":
+                     current_name += " " + token
+                 elif current_name:
+                     arabic_names.add(current_name)
+                     current_name = ""
+
+             if current_name:
+                 arabic_names.add(current_name)
+
+     return arabic_names
+
+ # Load the CAMeL-BERT MSA NER model and tokenizer
+ model_name = "CAMeL-Lab/bert-base-arabic-camelbert-msa-ner"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForTokenClassification.from_pretrained(model_name)
+
+ # Load JSON data from the file
+ basic = 'cache/output/basic_info_frame.json'
+ with open(basic, "r", encoding="utf-8") as file:
+     json_data = json.load(file)
+
+ # Extract names from the JSON data using the CAMeL-BERT model
+ arabic_names = extract_arabic_names(json_data, model, tokenizer)
+
+ # Print the extracted names
+ if arabic_names:
+     print("Arabic names extracted:")
+     for name in arabic_names:
+         print("Name:", name)
+ else:
+     print("No Arabic names found.")
pdf_to_imgs.py ADDED
@@ -0,0 +1,15 @@
+ import fitz  # PyMuPDF
+ import os
+
+ pdf_file_path = "sample/GB.pdf"
+ pdf = fitz.open(pdf_file_path)
+
+ save_dir = "cache/GB"
+ os.makedirs(save_dir, exist_ok=True)
+
+ # Render each page to a PNG named <pdf-stem>_page_<n>.png.
+ for page_num in range(len(pdf)):
+     page = pdf[page_num]
+     pix = page.get_pixmap()
+     image_filename = os.path.join(save_dir, f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_page_{page_num + 1}.png")
+     pix.save(image_filename)
+
+ pdf.close()
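get_pixmap() renders at 72 dpi by default, which is often too coarse for OCR. A hedged variant, assuming PyMuPDF 1.19.2 or later where get_pixmap accepts a dpi argument:

    # Render at 300 dpi for sharper OCR input.
    pix = page.get_pixmap(dpi=300)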
regex_extract.py ADDED
@@ -0,0 +1,102 @@
+ import json
+ import csv
+ import re
+ from translate import Translator
+
+ def load_regex_pattern(filename):
+     try:
+         with open(filename, 'r', encoding="utf-8") as config_file:
+             config_data = json.load(config_file)
+         return config_data
+     except FileNotFoundError:
+         print("regex file not found")
+         return {}
+
+ def translate_date(date_text):
+     # Map Arabic month names and Arabic-Indic digits to their English equivalents.
+     translation_dict = {
+         "يناير": "January",
+         "فبراير": "February",
+         "مارس": "March",
+         "ابريل": "April",
+         "مايو": "May",
+         "يونيو": "June",
+         "يوليو": "July",
+         "أغسطس": "August",
+         "سبتمبر": "September",
+         "اكتوبر": "October",
+         "نوفمبر": "November",
+         "ديسمبر": "December",
+         "٠": "0", "١": "1", "٢": "2", "٣": "3", "٤": "4",
+         "٥": "5", "٦": "6", "٧": "7", "٨": "8", "٩": "9"
+     }
+     for arabic, english in translation_dict.items():
+         date_text = date_text.replace(arabic, english)
+     return date_text
+
+ def translate_text(text):
+     translator = Translator(to_lang='en', from_lang='ar')
+     return translator.translate(text)
+
+ def extract_and_store_info(input_file, output_csv, regex_patterns):
+     extracted_data = {pattern_name: "" for pattern_name in regex_patterns}
+     with open(input_file, encoding="utf-8") as json_file:
+         json_data = json.load(json_file)
+
+     # For each pattern, keep the first match found in the OCR text.
+     for pattern_name, pattern_data in regex_patterns.items():
+         for entry in json_data:
+             text = entry.get("Arabic text", "")
+             match = re.search(pattern_data["pattern"], text, re.IGNORECASE)
+             if match:
+                 extracted_data[pattern_name] = match.group()
+                 break
+
+     # Translate the company name into English.
+     if extracted_data.get("company_title"):
+         extracted_data["company_title"] = translate_text(extracted_data["company_title"])
+
+     # Normalize the report-period fields to fixed English labels.
+     if extracted_data.get("annual_pattern"):
+         extracted_data["annual_pattern"] = "Annual"
+     elif extracted_data.get("half_annual_pattern"):
+         extracted_data["half_annual_pattern"] = "Half Annual"
+
+     if extracted_data.get("date_pattern"):
+         extracted_data["date_pattern"] = translate_date(extracted_data["date_pattern"])
+
+     with open(output_csv, mode='w', encoding="utf-8", newline='') as csv_output_file:
+         fieldnames = ["pattern_name", "extracted_data"]
+         writer = csv.DictWriter(csv_output_file, fieldnames=fieldnames)
+         writer.writeheader()
+         for pattern_name, data in extracted_data.items():
+             if data:
+                 writer.writerow({"pattern_name": pattern_name, "extracted_data": data})
+     extracted_count = sum(1 for data in extracted_data.values() if data)
+     print(f"{extracted_count} pieces of data extracted and stored in:", output_csv)
+
+ if __name__ == "__main__":
+     input_file = "cache/output/basic_info_frame.json"
+     output_csv_file = 'cache/output/outputregex.csv'
+     regex_patterns = load_regex_pattern("mainpipeline/models/regex_config.json")
+     if regex_patterns:
+         extract_and_store_info(input_file, output_csv_file, regex_patterns)
+     else:
+         print("failed to load regex")
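load_regex_pattern() expects mainpipeline/models/regex_config.json to map each pattern name to an object with a "pattern" key. The config file itself is not part of this upload, so the values below are purely illustrative:

    {
        "company_title": {"pattern": "شركة\\s+\\S+"},
        "date_pattern": {"pattern": "\\d{1,2}\\s+\\S+\\s+\\d{4}"},
        "annual_pattern": {"pattern": "سنوي"},
        "half_annual_pattern": {"pattern": "نصف سنوي"}
    }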
regex_format_after_OCR.py ADDED
@@ -0,0 +1,25 @@
+ import json
+ import glob
+
+ def extract_arabic_data(input_files, output_file):
+     arabic_data = []
+     for input_file in input_files:
+         # Load the OCR JSON output for one document.
+         with open(input_file, encoding="utf-8") as json_file:
+             json_data = json.load(json_file)
+
+         # Keep only the recognized text lines.
+         for entry in json_data:
+             if "lines" in entry:
+                 for line in entry["lines"]:
+                     text = line.get("text", "")
+                     arabic_data.append({"Arabic text": text})
+
+     with open(output_file, mode="w", encoding="utf-8") as json_output_file:
+         json.dump(arabic_data, json_output_file, ensure_ascii=False, indent=4)
+     print("Arabic data from", len(input_files), "JSON files has been extracted to:", output_file)
+
+ if __name__ == "__main__":
+     input_files = glob.glob("cache/GB/ocr_output*.json")
+     output_json_file = "cache/output/basic_info_frame.json"
+     extract_arabic_data(input_files, output_json_file)
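For reference, the data shapes this script assumes, as implied by the code above:

    # ocr_output*.json (input):       [{"lines": [{"text": "..."}]}, ...]
    # basic_info_frame.json (output): [{"Arabic text": "..."}, ...]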
stamp_detection.py ADDED
@@ -0,0 +1,26 @@
+ from ultralytics import YOLO
+ import os
+ import csv
+
+ model = YOLO('mainpipeline/models/stamp_detection_model.pt')
+ img_dir = 'cache/GB'
+ output_csv = 'cache/output/appended_data.csv'
+ revision_status = 'unrevised'
+
+ # Flag the document as "Revised" as soon as a stamp is detected on any page.
+ for img_name in os.listdir(img_dir):
+     img_path = os.path.join(img_dir, img_name)
+     if os.path.isfile(img_path):
+         results = model.predict(img_path, conf=0.25, save=False)
+         # predict() returns one Results object per image even when nothing is
+         # found, so check the detected boxes rather than len(results).
+         if len(results) > 0 and len(results[0].boxes) > 0:
+             revision_status = "Revised"
+             break
+
+ with open(output_csv, mode='a', encoding="utf-8", newline='') as file:
+     writer = csv.writer(file)
+     # Write a key/value header only if the file is still empty.
+     file.seek(0, os.SEEK_END)
+     if file.tell() == 0:
+         writer.writerow(["field", "value"])
+     writer.writerow(["Revision_status", revision_status])
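If individual detections need auditing before trusting the flag, the Results object exposes them directly. A small sketch using the standard Ultralytics accessors, assuming at least one image was processed:

    # Print each detected stamp's confidence and bounding box.
    for box in results[0].boxes:
        print(float(box.conf), box.xyxy.tolist())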