# project_DF / regex_format_after_OCR.py
# Ahmedhisham's picture
# Upload 6 files
# 2ae875f verified
import glob
import json
import os
def extract_arabic_data(input_files, output_file):
    """Gather the text of every OCR line from several JSON files into one file.

    Each input file is parsed as JSON and iterated as a sequence of entries.
    For every entry containing a "lines" key, the "text" value of each line
    (or "" when the key is absent) is stored as ``{"Arabic text": text}``.
    Note that no script detection is performed: the text of every line is
    copied as-is, whatever language it is written in.

    Args:
        input_files: Paths of the JSON files to read. Must support ``len``,
            which is used for the summary message.
        output_file: Path of the JSON file to write. Missing parent
            directories are created before writing.

    Returns:
        None. The collected records are written to ``output_file`` as a
        UTF-8 JSON list and a one-line summary is printed.
    """
    arabic_data = []
    for input_file in input_files:
        # Only parsing needs the handle; close it before walking the data.
        with open(input_file, encoding="utf-8") as json_file:
            json_data = json.load(json_file)

        for entry in json_data:
            if "lines" in entry:
                arabic_data.extend(
                    {"Arabic text": line.get("text", "")}
                    for line in entry["lines"]
                )

    # Opening a file for writing does not create its directory, so make sure
    # it exists; a bare file name has no directory component to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
    with open(output_file, mode="w", encoding="utf-8") as json_output_file:
        json.dump(arabic_data, json_output_file, ensure_ascii=False, indent=4)
    print("Arabic data from", len(input_files), "json files has been extracted in :", output_file)
if __name__ == "__main__":
    # glob returns matches in arbitrary, filesystem-dependent order; sort them
    # so the records are written in a reproducible order on every run.
    input_files = sorted(glob.glob("cache/GB/ocr_output*.json"))
    output_json_file = "cache/output/basic_info_frame.json"
    extract_arabic_data(input_files, output_json_file)