File size: 3,750 Bytes
2ae875f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import json 
import csv
from translate import Translator
#print("translate imported")
import re
def load_regex_pattern(filename):
    """Load the regex configuration stored as JSON in *filename*.

    Args:
        filename: Path to a UTF-8 encoded JSON file.

    Returns:
        The parsed JSON content, or an empty dict (after printing a notice)
        when the file does not exist.
    """
    try:
        with open(filename, mode='r', encoding="utf-8") as handle:
            patterns = json.loads(handle.read())
    except FileNotFoundError:
        print("regex file not found")
        patterns = {}
    return patterns
    
def translate_date(date_text):
    """Convert Arabic month names and Arabic-Indic digits in a date to English.

    Every month is mapped Arabic -> English (the previous table had eleven
    months keyed the other way round, so only December was ever translated
    and English month names were turned into Arabic).

    Args:
        date_text: Date string that may contain Arabic month names and/or
            Arabic-Indic digits. Text that matches nothing is left untouched.

    Returns:
        The string with digits replaced by ASCII digits and month names
        replaced by their English names.
    """
    # Arabic month name -> English month name. Spellings with and without
    # the hamza on the leading alef are both listed because both occur in
    # written Arabic.
    month_names = {
        "يناير": "January",
        "فبراير": "February",
        "مارس": "March",
        "ابريل": "April",
        "أبريل": "April",
        "مايو": "May",
        "يونيو": "June",
        "يوليو": "July",
        "أغسطس": "August",
        "اغسطس": "August",
        "سبتمبر": "September",
        "اكتوبر": "October",
        "أكتوبر": "October",
        "نوفمبر": "November",
        "ديسمبر": "December",
    }
    # Arabic-Indic digits occupy the contiguous range U+0660..U+0669.
    digit_table = str.maketrans(
        {chr(0x0660 + value): str(value) for value in range(10)}
    )

    date_text = date_text.translate(digit_table)
    for arabic_name, english_name in month_names.items():
        date_text = date_text.replace(arabic_name, english_name)
    return date_text

def translate_text(text):
    """Translate *text* from Arabic ('ar') to English ('en').

    Builds a new ``Translator`` for the call and returns whatever its
    ``translate`` method produces for *text*.
    """
    return Translator(from_lang='ar', to_lang='en').translate(text)

def _first_match(pattern, entries):
    """Return the first non-empty, case-insensitive match of *pattern* found
    in the "Arabic text" field of *entries*, or "" when nothing matches."""
    for entry in entries:
        if "Arabic text" not in entry:
            continue
        match = re.search(pattern, entry.get("Arabic text", ""), re.IGNORECASE)
        # An empty match counts as "not found", so keep scanning.
        if match and match.group():
            return match.group()
    return ""


def extract_and_store_info(input_file, output_csv, regex_patterns):
    """Extract one value per configured regex from a JSON file and save them as CSV.

    For every pattern the entries of the JSON document are scanned in order
    and the first non-empty match in an entry's "Arabic text" field is kept.
    Some values are then post-processed: "company_title" is passed through
    ``translate_text``, the reporting-period match is normalised to
    "Annual" / "Half Annual", and "date_pattern" is passed through
    ``translate_date``.

    Args:
        input_file: Path to a UTF-8 JSON file whose top-level value is
            iterated entry by entry; only entries containing an
            "Arabic text" key are searched.
        output_csv: Path of the CSV file to write (overwritten). It gets a
            "pattern_name,extracted_data" header and one row per pattern
            that produced a non-empty value.
        regex_patterns: Mapping of pattern name to a dict holding the regex
            source under the key "pattern".

    Returns:
        None. The number of extracted values is printed to stdout.
    """
    with open(input_file, encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    extracted_data = {
        pattern_name: _first_match(pattern_data["pattern"], json_data)
        for pattern_name, pattern_data in regex_patterns.items()
    }

    # Translate the company name into English.
    if extracted_data.get("company_title"):
        extracted_data["company_title"] = translate_text(extracted_data["company_title"])

    # Normalise the reporting period label.
    # NOTE(review): the original condition also requires "date_pattern" to be
    # configured; kept as-is to preserve behaviour - confirm it is intended.
    if "annual_pattern" in regex_patterns and "date_pattern" in regex_patterns:
        if re.search(
            regex_patterns["annual_pattern"]["pattern"],
            extracted_data["annual_pattern"],
            re.IGNORECASE,
        ):
            extracted_data["annual_pattern"] = "Annual"
        # "half_annual_pattern" is optional: guard the lookup so a config
        # without it no longer raises KeyError.
        elif "half_annual_pattern" in regex_patterns and re.search(
            regex_patterns["half_annual_pattern"]["pattern"],
            extracted_data["half_annual_pattern"],
            re.IGNORECASE,
        ):
            extracted_data["half_annual_pattern"] = "Half Annual"

    if "date_pattern" in regex_patterns:
        extracted_data["date_pattern"] = translate_date(extracted_data["date_pattern"])

    # Only patterns that actually produced a value are written and counted.
    found = {name: value for name, value in extracted_data.items() if value}

    with open(output_csv, mode='w', encoding="utf-8", newline='') as csv_output_file:
        writer = csv.DictWriter(csv_output_file, fieldnames=["pattern_name", "extracted_data"])
        writer.writeheader()
        writer.writerows(
            {"pattern_name": name, "extracted_data": value}
            for name, value in found.items()
        )

    print(f"{len(found)} pieces of data extracted and stored in:", output_csv)


if __name__=="__main__":
    # Script entry point: load the regex configuration first and only run
    # the extraction when it produced at least one pattern.
    regex_patterns=load_regex_pattern("mainpipeline/models/regex_config.json")
    input_file="cache/output/basic_info_frame.json"
    output_csv_file='cache/output/outputregex.csv'
    if not regex_patterns:
        print("failed to load regex")
    else:
        extract_and_store_info(input_file,output_csv_file,regex_patterns)