Spaces:
No application file
No application file
| import json | |
| import csv | |
| from translate import Translator | |
| #print("translate imported") | |
| import re | |
def load_regex_pattern(filename):
    """Load the regex configuration stored as JSON in *filename*.

    Args:
        filename: Path of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist (a message is printed in that case).
    """
    try:
        handle = open(filename, "r", encoding="utf-8")
    except FileNotFoundError:
        # A missing config is reported and signalled with an empty mapping
        # instead of an exception.
        print("regex file not found")
        return {}

    with handle:
        return json.load(handle)
# Arabic month names mapped to English. Spellings with and without the
# hamza are both listed because either form can appear in source text.
_ARABIC_MONTHS_TO_ENGLISH = {
    "يناير": "January",
    "فبراير": "February",
    "مارس": "March",
    "ابريل": "April",
    "أبريل": "April",
    "مايو": "May",
    "يونيو": "June",
    "يوليو": "July",
    "أغسطس": "August",
    "اغسطس": "August",
    "سبتمبر": "September",
    "اكتوبر": "October",
    "أكتوبر": "October",
    "نوفمبر": "November",
    "ديسمبر": "December",
}

# Arabic-Indic digits U+0660..U+0669 mapped to ASCII "0".."9", in the
# ordinal-keyed form expected by str.translate.
_ARABIC_DIGITS_TO_ASCII = {0x0660 + value: str(value) for value in range(10)}


def translate_date(date_text):
    """Convert Arabic month names and Arabic-Indic digits in a date to English.

    Args:
        date_text: Date string that may contain Arabic month names and/or
            Arabic-Indic digits.

    Returns:
        The string with every known Arabic month name replaced by its English
        name and every Arabic-Indic digit replaced by the ASCII digit. Any
        other text (including text that is already English) is returned
        unchanged.
    """
    # Single C-level pass for the per-character digit substitution.
    date_text = date_text.translate(_ARABIC_DIGITS_TO_ASCII)
    for arabic_month, english_month in _ARABIC_MONTHS_TO_ENGLISH.items():
        date_text = date_text.replace(arabic_month, english_month)
    return date_text
def translate_text(text):
    """Translate *text* from Arabic ('ar') to English ('en').

    A new ``Translator`` is created on every call and the result of its
    ``translate`` method is returned as-is.
    """
    return Translator(from_lang='ar', to_lang='en').translate(text)
def extract_and_store_info(input_file, output_csv, regex_patterns):
    """Extract the first match of each configured regex and store the hits as CSV.

    The JSON document in *input_file* is iterated entry by entry. For every
    pattern, the text under the ``"Arabic text"`` key of each entry is
    searched (case-insensitively) until a non-empty match is found. The
    matches are then post-processed and written to *output_csv*.

    Post-processing:
        * ``company_title`` is passed through ``translate_text``.
        * ``annual_pattern`` / ``half_annual_pattern`` are normalised to
          ``"Annual"`` / ``"Half Annual"``.
        * ``date_pattern`` is passed through ``translate_date``.

    Args:
        input_file: Path of the UTF-8 JSON file to scan.
        output_csv: Path of the CSV file to (over)write. It gets the columns
            ``pattern_name`` and ``extracted_data``; patterns without a match
            are omitted.
        regex_patterns: Mapping of pattern name to a dict holding the regex
            string under the ``"pattern"`` key.

    Side effects:
        Prints how many values were extracted and where they were stored.
    """
    extracted_data = {pattern_name: "" for pattern_name in regex_patterns}

    with open(input_file, encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    for pattern_name, pattern_data in regex_patterns.items():
        for entry in json_data:
            # Skip anything that is not a record with usable text (e.g. a
            # JSON null under "Arabic text") instead of crashing the scan.
            if not isinstance(entry, dict):
                continue
            text = entry.get("Arabic text")
            if not isinstance(text, str):
                continue
            match = re.search(pattern_data["pattern"], text, re.IGNORECASE)
            # An empty match counts as "nothing found": keep looking.
            if match and match.group():
                extracted_data[pattern_name] = match.group()
                break

    # Translate the company name into English.
    if extracted_data.get("company_title"):
        extracted_data["company_title"] = translate_text(extracted_data["company_title"])

    # NOTE(review): the gate on "date_pattern" is kept from the original
    # logic; it looks unrelated to the periodicity labels -- confirm intent.
    if "annual_pattern" in regex_patterns and "date_pattern" in regex_patterns:
        if re.search(
            regex_patterns["annual_pattern"]["pattern"],
            extracted_data["annual_pattern"],
            re.IGNORECASE,
        ):
            extracted_data["annual_pattern"] = "Annual"
        # "half_annual_pattern" is optional in the config, so check for it
        # before indexing to avoid a KeyError.
        elif "half_annual_pattern" in regex_patterns and re.search(
            regex_patterns["half_annual_pattern"]["pattern"],
            extracted_data["half_annual_pattern"],
            re.IGNORECASE,
        ):
            extracted_data["half_annual_pattern"] = "Half Annual"

    if "date_pattern" in regex_patterns:
        extracted_data["date_pattern"] = translate_date(extracted_data["date_pattern"])

    with open(output_csv, mode='w', encoding="utf-8", newline='') as csv_output_file:
        writer = csv.DictWriter(csv_output_file, fieldnames=["pattern_name", "extracted_data"])
        writer.writeheader()
        # Only patterns that actually produced a value get a row.
        for pattern_name, data in extracted_data.items():
            if data:
                writer.writerow({"pattern_name": pattern_name, "extracted_data": data})

    extracted_count = sum(1 for data in extracted_data.values() if data)
    print(f"{extracted_count} pieces of data extracted and stored in:", output_csv)
if __name__ == "__main__":
    # Script entry point: load the regex configuration first, then run the
    # extraction over the fixed input/output locations.
    regex_patterns = load_regex_pattern("mainpipeline/models/regex_config.json")
    input_file = "cache/output/basic_info_frame.json"
    output_csv_file = 'cache/output/outputregex.csv'

    if not regex_patterns:
        # An empty mapping means the configuration could not be loaded.
        print("failed to load regex")
    else:
        extract_and_store_info(input_file, output_csv_file, regex_patterns)