import csv
import json
import re

from translate import Translator


def load_regex_pattern(filename):
    """Load the regex configuration from a JSON file.

    Args:
        filename: Path to the JSON configuration file.

    Returns:
        The parsed JSON content, or an empty dict when the file does not
        exist (a message is printed in that case).
    """
    try:
        with open(filename, "r", encoding="utf-8") as config_file:
            return json.load(config_file)
    except FileNotFoundError:
        print("regex file not found")
        return {}


def translate_date(date_text):
    """Convert Arabic month names and Arabic-Indic digits to English.

    Text that is already in English (month names, Western digits) is left
    untouched.

    Args:
        date_text: The date string to convert; may be empty.

    Returns:
        The date string with Arabic months and digits replaced.
    """
    # Arabic -> English. Several months are commonly written with or without
    # a hamza on the leading alef, so both spellings are accepted.
    month_names = {
        "يناير": "January",
        "فبراير": "February",
        "مارس": "March",
        "ابريل": "April",
        "أبريل": "April",
        "إبريل": "April",
        "مايو": "May",
        "يونيو": "June",
        "يوليو": "July",
        "أغسطس": "August",
        "اغسطس": "August",
        "سبتمبر": "September",
        "اكتوبر": "October",
        "أكتوبر": "October",
        "نوفمبر": "November",
        "ديسمبر": "December",
    }
    digit_table = str.maketrans({
        "٠": "0",
        "١": "1",
        "٢": "2",
        "٣": "3",
        "٤": "4",
        "٥": "5",
        "٦": "6",
        "٧": "7",
        "٨": "8",
        "٩": "9",
    })

    # Digits are single characters, so one translate() pass handles them all.
    date_text = date_text.translate(digit_table)
    for arabic_month, english_month in month_names.items():
        date_text = date_text.replace(arabic_month, english_month)
    return date_text


def translate_text(text):
    """Translate Arabic *text* to English using the ``translate`` package."""
    translator = Translator(to_lang="en", from_lang="ar")
    return translator.translate(text)


def _find_first_match(pattern, entries):
    """Return the first non-empty match of *pattern* in the entries' Arabic text.

    Entries without an "Arabic text" key are skipped. Returns "" when no
    entry yields a non-empty match.
    """
    compiled = re.compile(pattern, re.IGNORECASE)
    for entry in entries:
        if "Arabic text" not in entry:
            continue
        match = compiled.search(entry.get("Arabic text", ""))
        # An empty match counts as "not found", so keep scanning.
        if match and match.group():
            return match.group()
    return ""


def extract_and_store_info(input_file, output_csv, regex_patterns):
    """Extract one value per configured pattern and write the results to CSV.

    For every pattern in *regex_patterns* the first non-empty match found in
    the "Arabic text" field of the input entries is kept. The company title
    is translated to English, the reporting period is normalised to
    "Annual" / "Half Annual", and the date is converted to English month
    names and Western digits. Only patterns that produced a value are
    written.

    Args:
        input_file: Path to a JSON file holding a list of entries.
        output_csv: Path of the CSV file to write.
        regex_patterns: Mapping of pattern name to a dict with a "pattern" key.
    """
    with open(input_file, encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    extracted_data = {
        pattern_name: _find_first_match(pattern_data["pattern"], json_data)
        for pattern_name, pattern_data in regex_patterns.items()
    }

    # Translate the company name into English.
    if extracted_data.get("company_title"):
        extracted_data["company_title"] = translate_text(
            extracted_data["company_title"]
        )

    # Each pattern is looked up defensively: the half-annual pattern may be
    # absent from the configuration, and neither depends on the date pattern.
    annual_config = regex_patterns.get("annual_pattern")
    half_annual_config = regex_patterns.get("half_annual_pattern")
    if annual_config and re.search(
        annual_config["pattern"],
        extracted_data["annual_pattern"],
        re.IGNORECASE,
    ):
        extracted_data["annual_pattern"] = "Annual"
    elif half_annual_config and re.search(
        half_annual_config["pattern"],
        extracted_data["half_annual_pattern"],
        re.IGNORECASE,
    ):
        extracted_data["half_annual_pattern"] = "Half Annual"

    if "date_pattern" in regex_patterns:
        extracted_data["date_pattern"] = translate_date(
            extracted_data["date_pattern"]
        )

    with open(output_csv, mode="w", encoding="utf-8", newline="") as csv_output_file:
        fieldnames = ["pattern_name", "extracted_data"]
        writer = csv.DictWriter(csv_output_file, fieldnames=fieldnames)
        writer.writeheader()
        for pattern_name, data in extracted_data.items():
            if data:
                writer.writerow(
                    {"pattern_name": pattern_name, "extracted_data": data}
                )

    extracted_count = sum(1 for data in extracted_data.values() if data)
    print(f"{extracted_count} pieces of data extracted and stored in:", output_csv)


def main():
    """Run the extraction with the pipeline's default paths."""
    input_file = "cache/output/basic_info_frame.json"
    output_csv_file = "cache/output/outputregex.csv"
    regex_patterns = load_regex_pattern("mainpipeline/models/regex_config.json")
    if regex_patterns:
        extract_and_store_info(input_file, output_csv_file, regex_patterns)
    else:
        print("failed to load regex")


if __name__ == "__main__":
    main()