File size: 977 Bytes
2ae875f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import json
import glob
def extract_arabic_data(input_files,output_file):
    """Gather the text of every OCR line from several JSON files into one.

    Each input file is read as UTF-8 JSON and iterated as a sequence of
    entries. Every entry that is an object with a "lines" list contributes
    one output record per line, shaped ``{"Arabic text": <text>}``, where
    ``<text>`` is the line's "text" value (``""`` when the key is absent).
    No language or script filtering is applied: every line's text is copied.

    Entries and lines that are not JSON objects are skipped instead of
    raising, so stray values in the OCR output do not abort the whole run.

    Args:
        input_files: Iterable of paths to the JSON files to read.
        output_file: Path of the JSON file to write. Missing parent
            directories are created.

    Raises:
        OSError: If an input file cannot be read or the output cannot be
            written.
        json.JSONDecodeError: If an input file does not contain valid JSON.
    """
    import os  # local import: the file's top-level imports do not include os

    # Materialise once so generators work and len() below is always valid.
    input_files = list(input_files)

    arabic_data = []
    for input_file in input_files:
        # Load the JSON payload of the current input file.
        with open(input_file, encoding="utf-8") as json_file:
            json_data = json.load(json_file)

        for entry in json_data:
            # On a str entry, `"lines" in entry` would be a substring test,
            # so require a real JSON object before looking up the key.
            if not isinstance(entry, dict):
                continue
            lines = entry.get("lines")
            if not isinstance(lines, list):
                continue
            for line in lines:
                if not isinstance(line, dict):
                    continue
                arabic_data.append({"Arabic text": line.get("text", "")})

    # Create the destination directory so a fresh checkout does not fail
    # with FileNotFoundError on the first run.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, mode="w", encoding="utf-8") as json_output_file:
        json.dump(arabic_data, json_output_file, ensure_ascii=False, indent=4)
    print("Arabic data from",len(input_files),"json files has been extracted in :",output_file)

if __name__ == "__main__":
    # glob.glob returns matches in filesystem-dependent order; sort them so
    # the order of records in the output file is reproducible between runs.
    input_files = sorted(glob.glob("cache/GB/ocr_output*.json"))
    output_json_file = "cache/output/basic_info_frame.json"
    extract_arabic_data(input_files, output_json_file)