# Source: project_DF / regex_extract.py
# Repository page metadata: uploaded by Ahmedhisham ("Ahmedhisham's picture"),
# commit message "Upload 6 files", commit 2ae875f (verified).
import json
import csv
from translate import Translator
#print("translate imported")
import re
def load_regex_pattern(filename):
    """Load the regex configuration stored as JSON in *filename*.

    Returns the parsed JSON content. If the file does not exist, a message
    is printed and an empty dict is returned instead.
    """
    try:
        config_file = open(filename, 'r', encoding="utf-8")
    except FileNotFoundError:
        print("regex file not found")
        return {}
    # The handle is closed by the context manager once parsing finishes.
    with config_file:
        return json.load(config_file)
def translate_date(date_text):
    """Convert an Arabic date string to English month names and ASCII digits.

    Every Arabic month name found in *date_text* is replaced by its English
    name and every Arabic-Indic digit by the matching ASCII digit. Text that
    is already English is left untouched.

    Args:
        date_text: The date string to convert. An empty value is returned
            unchanged.

    Returns:
        The converted string.
    """
    # Keys are the Arabic spellings because str.replace substitutes
    # key -> value; the goal is Arabic -> English.
    arabic_months = {
        "يناير": "January",
        "فبراير": "February",
        "مارس": "March",
        "ابريل": "April",
        "أبريل": "April",  # spelling with hamza
        "مايو": "May",
        "يونيو": "June",
        "يوليو": "July",
        "أغسطس": "August",
        "اغسطس": "August",  # spelling without hamza
        "سبتمبر": "September",
        "اكتوبر": "October",
        "أكتوبر": "October",  # spelling with hamza
        "نوفمبر": "November",
        "ديسمبر": "December",
    }
    arabic_digits = str.maketrans({
        "٠": "0",
        "١": "1",
        "٢": "2",
        "٣": "3",
        "٤": "4",
        "٥": "5",
        "٦": "6",
        "٧": "7",
        "٨": "8",
        "٩": "9",
    })

    if not date_text:
        return date_text

    for ar_month, en_month in arabic_months.items():
        date_text = date_text.replace(ar_month, en_month)
    # One pass over the string handles all ten digits.
    return date_text.translate(arabic_digits)
def translate_text(text):
    """Return *text* translated from Arabic ('ar') to English ('en').

    The translation is delegated to a freshly created ``Translator``.
    """
    return Translator(to_lang='en', from_lang='ar').translate(text)
def _first_match(pattern, entries):
    """Return the first non-empty match of *pattern* in the entries' "Arabic text", or ""."""
    compiled = re.compile(pattern, re.IGNORECASE)
    for entry in entries:
        # Skip anything that is not a record carrying an "Arabic text" string.
        if not isinstance(entry, dict) or "Arabic text" not in entry:
            continue
        text = entry["Arabic text"]
        if not isinstance(text, str):
            continue
        found = compiled.search(text)
        if found and found.group():
            return found.group()
    return ""


def _period_matches(pattern_data, value):
    """Tell whether *value* is non-empty and matches the configured pattern, if any."""
    return bool(
        pattern_data
        and value
        and re.search(pattern_data["pattern"], value, re.IGNORECASE)
    )


def extract_and_store_info(input_file, output_csv, regex_patterns):
    """Extract one value per regex pattern from a JSON file and write them to CSV.

    For every pattern the first non-empty, case-insensitive match found in
    the "Arabic text" field of the JSON entries is kept. Afterwards:

    * a found "company_title" is passed through ``translate_text``;
    * a found "annual_pattern" value is replaced by "Annual", otherwise a
      found "half_annual_pattern" value is replaced by "Half Annual";
    * when "date_pattern" is configured its value is passed through
      ``translate_date``.

    Only non-empty values are written, one row each, under the header
    ``pattern_name,extracted_data``. A summary line is printed at the end.

    Args:
        input_file: Path of the JSON file holding the entries to scan.
        output_csv: Path of the CSV file to (over)write.
        regex_patterns: Mapping of pattern name to a dict with a "pattern"
            key holding the regular expression.
    """
    with open(input_file, encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    extracted_data = {
        pattern_name: _first_match(pattern_data["pattern"], json_data)
        for pattern_name, pattern_data in regex_patterns.items()
    }

    # translate the company name into english
    if extracted_data.get("company_title"):
        extracted_data["company_title"] = translate_text(extracted_data["company_title"])

    # Each period pattern is looked up on its own so that a configuration
    # lacking one of them neither raises KeyError nor skips the other, and
    # the labelling no longer depends on "date_pattern" being configured.
    if _period_matches(regex_patterns.get("annual_pattern"),
                       extracted_data.get("annual_pattern")):
        extracted_data["annual_pattern"] = "Annual"
    elif _period_matches(regex_patterns.get("half_annual_pattern"),
                         extracted_data.get("half_annual_pattern")):
        extracted_data["half_annual_pattern"] = "Half Annual"

    if "date_pattern" in regex_patterns:
        extracted_data["date_pattern"] = translate_date(extracted_data["date_pattern"])

    with open(output_csv, mode='w', encoding="utf-8", newline='') as csv_output_file:
        fieldnames = ["pattern_name", "extracted_data"]
        writer = csv.DictWriter(csv_output_file, fieldnames=fieldnames)
        writer.writeheader()
        for pattern_name, data in extracted_data.items():
            if data:
                writer.writerow({"pattern_name": pattern_name, "extracted_data": data})

    extracted_count = sum(1 for data in extracted_data.values() if data)
    print(f"{extracted_count} pieces of data extracted and stored in:", output_csv)
if __name__ == "__main__":
    # Script entry point: load the regex configuration, then run the
    # extraction only when at least one pattern was loaded.
    regex_patterns = load_regex_pattern("mainpipeline/models/regex_config.json")
    input_file = "cache/output/basic_info_frame.json"
    output_csv_file = 'cache/output/outputregex.csv'
    if not regex_patterns:
        print("failed to load regex")
    else:
        extract_and_store_info(input_file, output_csv_file, regex_patterns)