binder-sa
/

OCR-pipeline-python

Model card Files Files and versions

xet

Community

abdullah-1111 committed on Aug 7, 2025

Commit

b278f6d

verified ·

1 Parent(s): b8b0baa

Update json-CR6.py

Browse files

Files changed (1) hide show

json-CR6.py +140 -138

json-CR6.py CHANGED Viewed

@@ -1,138 +1,140 @@
-import base64
-import json
-import re
-import requests
-import os
-# مفتاح Gemini API
-API_KEY = "AIzaSyBr2-dUqHDZkk20hlWeEcpWnVVdkq9fqyE"
-# المجلد الذي يحتوي صور cr1
-cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR6"
-# مجلد إخراج ملفات JSON
-output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR6\cr6_json"
-# Ensure output folder exists
-os.makedirs(output_json_folder, exist_ok=True)
-# Exact same prompt
-prompt = """
-يرجى استخراج الحقول التالية من مستند السجل التجاري (CR6) بالصورة، باللغة العربية فقط:
-- الكيان التجاري
-- حالة السجل
-- مدة المنشأة
-- الرقم الوطني الموحد للمنشأة
-- رابط المتجر الإكتروني
-- رأس المال
-- المدينة
-- صندوق البريد
-- الرمز البريدي
-- هاتف
-- تاريخ اصدار السجل
-- تاريخ انتهاء السجل
-- الموقع الاكتروني
-- العنوان
-- النشاط التجاري
-أرجو إعادة النتيجة بصيغة JSON بهذه المفاتيح فقط، وإذا أي حقل غير موجود فضع قيمته null:
-{
-  "الكيان التجاري": null,
-  "حالة السجل": null,
-  "مدة المنشأة": null,
-  "الرقم الوطني الموحد للمنشأة": null,
-  "رابط المتجر الإكتروني": null,
-  "رأس المال": null,
-  "المدينة": null,
-  "صندوق البريد": null,
-  "الرمز البريدي": null,
-  "هاتف": null,
-  "تاريخ اصدار السجل": null,
-  "تاريخ انتهاء السجل": null,
-  "الموقع الاكتروني": null,
-  "العنوان": null,
-  "النشاط التجاري": null,
-  "رقم المبنى": null,
-  "الرقم الإضافي": null,
-  "رقم الوحدة": null
-}
-"""
-url = f"https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent?key={API_KEY}"
-headers = {"Content-Type": "application/json"}
-# Iterate over all images
-for image_name in os.listdir(cr1_images_folder):
-    if not image_name.lower().endswith(('.jpg', '.jpeg', '.png')):
-        continue
-    image_path = os.path.join(cr1_images_folder, image_name)
-    base_name = os.path.splitext(image_name)[0]
-    output_file = os.path.join(output_json_folder, base_name + ".json")
-    # Skip if JSON already exists
-    if os.path.exists(output_file):
-        print(f"⏩ Skipped {image_name} (JSON file already exists)")
-        continue
-    # Read image and convert to base64
-    with open(image_path, "rb") as f:
-        image_b64 = base64.b64encode(f.read()).decode()
-    # Send request to Gemini API
-    data = {
-        "contents": [
-            {
-                "role": "user",
-                "parts": [
-                    {"text": prompt},
-                    {
-                        "inline_data": {
-                            "mime_type": "image/jpeg",
-                            "data": image_b64
-                        }
-                    }
-                ]
-            }
-        ]
-    }
-    try:
-        response = requests.post(url, headers=headers, json=data)
-        response.raise_for_status()
-        response_text = response.json()['candidates'][0]['content']['parts'][0]['text']
-        match = re.search(r"```json\s*(\{.*\})\s*```", response_text, re.DOTALL)
-        if match:
-            json_text = match.group(1)
-            result = json.loads(json_text)
-            # تقسيم حقل العنوان إذا كان موجود
-            address = result.get("العنوان")
-            if address:
-                parts = address.strip().split()
-                if len(parts) == 5:
-                    result["رقم المبنى"] = parts[0]
-                    result["المدينة"] = parts[1]
-                    result["الرمز البريدي"] = parts[2]
-                    result["الرقم الإضافي"] = parts[3]
-                    result["رقم الوحدة"] = parts[4]
-                else:
-                    result["رقم المبنى"] = None
-                    result["المدينة"] = None
-                    result["الرمز البريدي"] = None
-                    result["الرقم الإضافي"] = None
-                    result["رقم الوحدة"] = None
-            with open(output_file, "w", encoding="utf-8") as f:
-                json.dump(result, f, ensure_ascii=False, indent=2)
-            print(f"✅ Processed: {image_name}")
-        else:
-            print(f"❌ Failed to extract JSON from: {image_name}")
-            print(response_text)
-    except Exception as e:
-        print(f"❌ Error processing image {image_name}: {e}")

+import base64
+import json
+import re
+import requests
+import os
+# مفتاح Gemini API
+API_KEY = "AIzaSyBr2-dUqHDZkk20hlWeEcpWnVVdkq9fqyE"
+# المجلد الذي يحتوي صور cr1
+cr1_images_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR6"
+# مجلد إخراج ملفات JSON
+output_json_folder = r"C:\Users\ASUS\OneDrive - Binder\Desktop\test.orch\classified\CR6\cr6_json"
+# Ensure output folder exists
+os.makedirs(output_json_folder, exist_ok=True)
+# Exact same prompt
+# Exact same prompt
+prompt = """
+Please extract the following fields from the CR6 commercial registration document image, all in Arabic only:
+- الكيان التجاري
+- حالة السجل
+- مدة المنشأة
+- الرقم الوطني الموحد للمنشأة
+- رابط المتجر الإكتروني
+- رأس المال
+- المدينة
+- صندوق البريد
+- الرمز البريدي
+- هاتف
+- تاريخ اصدار السجل
+- تاريخ انتهاء السجل
+- الموقع الاكتروني
+- العنوان
+- النشاط التجاري
+Please return the result as JSON with the following exact keys, and if any field is missing, set its value to null:
+{
+  "commercial_entity": null,
+  "registry_status": null,
+  "establishment_duration": null,
+  "unified_national_number": null,
+  "online_store_link": null,
+  "capital": null,
+  "city": null,
+  "po_box": null,
+  "postal_code": null,
+  "phone": null,
+  "registry_issue_date": null,
+  "registry_expiry_date": null,
+  "website": null,
+  "address": null,
+  "business_activity": null,
+  "building_number": null,
+  "additional_number": null,
+  "unit_number": null
+}
+"""
+url = f"https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent?key={API_KEY}"
+headers = {"Content-Type": "application/json"}
+# Iterate over all images
+for image_name in os.listdir(cr1_images_folder):
+    if not image_name.lower().endswith(('.jpg', '.jpeg', '.png')):
+        continue
+    image_path = os.path.join(cr1_images_folder, image_name)
+    base_name = os.path.splitext(image_name)[0]
+    output_file = os.path.join(output_json_folder, base_name + ".json")
+    # Skip if JSON already exists
+    if os.path.exists(output_file):
+        print(f"⏩ Skipped {image_name} (JSON file already exists)")
+        continue
+    # Read image and convert to base64
+    with open(image_path, "rb") as f:
+        image_b64 = base64.b64encode(f.read()).decode()
+    # Send request to Gemini API
+    data = {
+        "contents": [
+            {
+                "role": "user",
+                "parts": [
+                    {"text": prompt},
+                    {
+                        "inline_data": {
+                            "mime_type": "image/jpeg",
+                            "data": image_b64
+                        }
+                    }
+                ]
+            }
+        ]
+    }
+    try:
+        response = requests.post(url, headers=headers, json=data)
+        response.raise_for_status()
+        response_text = response.json()['candidates'][0]['content']['parts'][0]['text']
+        match = re.search(r"```json\s*(\{.*\})\s*```", response_text, re.DOTALL)
+        if match:
+            json_text = match.group(1)
+            result = json.loads(json_text)
+            # تقسيم حقل العنوان إذا كان موجود
+            address = result.get("العنوان")
+            if address:
+                parts = address.strip().split()
+                if len(parts) == 5:
+                    result["رقم المبنى"] = parts[0]
+                    result["المدينة"] = parts[1]
+                    result["الرمز البريدي"] = parts[2]
+                    result["الرقم الإضافي"] = parts[3]
+                    result["رقم الوحدة"] = parts[4]
+                else:
+                    result["رقم المبنى"] = None
+                    result["المدينة"] = None
+                    result["الرمز البريدي"] = None
+                    result["الرقم الإضافي"] = None
+                    result["رقم الوحدة"] = None
+            with open(output_file, "w", encoding="utf-8") as f:
+                json.dump(result, f, ensure_ascii=False, indent=2)
+            print(f"✅ Processed: {image_name}")
+        else:
+            print(f"❌ Failed to extract JSON from: {image_name}")
+            print(response_text)
+    except Exception as e:
+        print(f"❌ Error processing image {image_name}: {e}")