"""Collect OCR line texts from several JSON files into a single JSON file."""

import glob
import json
import os


def extract_arabic_data(input_files, output_file):
    """Copy every line's ``text`` value from the input files into one JSON list.

    Each input file is parsed as JSON and iterated entry by entry. For every
    entry that contains a ``"lines"`` key, each item of that list contributes
    one record ``{"Arabic text": <text>}`` to the output, where ``<text>`` is
    the item's ``"text"`` value, or ``""`` when the key is absent. No language
    detection is performed: the text is copied as-is.

    Args:
        input_files: Sized collection of paths to UTF-8 encoded JSON files.
            Files are processed in the order given.
        output_file: Path of the JSON file to write. Its parent directory is
            created when it does not exist yet.

    Raises:
        OSError: If an input file cannot be read or the output cannot be
            written.
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    arabic_data = []

    for input_file in input_files:
        with open(input_file, encoding="utf-8") as json_file:
            json_data = json.load(json_file)

        # Entries without a "lines" key carry nothing to extract.
        for entry in json_data:
            if "lines" in entry:
                arabic_data.extend(
                    {"Arabic text": line.get("text", "")}
                    for line in entry["lines"]
                )

    # open(..., "w") does not create missing directories; do it up front so
    # the parsed data is not lost to a FileNotFoundError at the very end.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX.
    with open(output_file, mode="w", encoding="utf-8") as json_output_file:
        json.dump(arabic_data, json_output_file, ensure_ascii=False, indent=4)

    print(
        "Arabic data from",
        len(input_files),
        "json files has been extracted in :",
        output_file,
    )


if __name__ == "__main__":
    # glob returns matches in arbitrary order; sort so the output is
    # reproducible from run to run.
    input_files = sorted(glob.glob("cache/GB/ocr_output*.json"))
    output_json_file = "cache/output/basic_info_frame.json"
    extract_arabic_data(input_files, output_json_file)