usmanabbasi committed on
Commit
60a5e14
1 Parent(s): 4b81d29

Update DataExtraction.py

Browse files
Files changed (1) hide show
  1. DataExtraction.py +75 -75
DataExtraction.py CHANGED
@@ -1,75 +1,75 @@
1
- import streamlit as st
2
- from PIL import Image
3
- import numpy as np
4
- import easyocr
5
- import pandas as pd
6
- import base64
7
- import re
8
- from datetime import datetime, timedelta
9
-
10
- def process_image(image):
11
- reader = easyocr.Reader(['en'], gpu=False)
12
- img_np = np.array(image)
13
- result = reader.readtext(img_np)
14
-
15
- extracted_data = {
16
- "Name": None,
17
- "Father Name": None,
18
- "Gender": None,
19
- "Country of Stay": "Pakistan",
20
- "Identity Number": None,
21
- "Date of Birth": None,
22
- "Date of Issue": None,
23
- "Date of Expiry": None
24
- }
25
-
26
- for i, detection in enumerate(result):
27
- text = detection[1].strip()
28
- if "name" in text.lower() and not "father" in text.lower():
29
- extracted_data["Name"] = result[i+1][1].strip() if i+1 < len(result) else None
30
- elif "father" in text.lower():
31
- extracted_data["Father Name"] = result[i+1][1].strip() if i+1 < len(result) else None
32
- elif text.lower() in ["m", "f"]:
33
- extracted_data["Gender"] = text.upper()
34
- elif re.match(r'\d{5}-\d{7}-\d', text):
35
- extracted_data["Identity Number"] = text
36
- elif re.match(r'\d{2}\.\d{2}\.\d{4}', text):
37
- if extracted_data["Date of Birth"] is None:
38
- extracted_data["Date of Birth"] = text
39
- elif extracted_data["Date of Issue"] is None:
40
- extracted_data["Date of Issue"] = text
41
-
42
- if extracted_data["Date of Issue"] and not extracted_data["Date of Expiry"]:
43
- try:
44
- date_of_issue = datetime.strptime(extracted_data["Date of Issue"], "%d.%m.%Y")
45
- date_of_expiry = date_of_issue.replace(year=date_of_issue.year + 10)
46
- extracted_data["Date of Expiry"] = date_of_expiry.strftime("%d.%m.%Y")
47
- except ValueError:
48
- pass
49
-
50
- return extracted_data
51
-
52
- def display_table(extracted_data):
53
- fields = ["Name", "Father Name", "Gender", "Country of Stay", "Identity Number", "Date of Birth", "Date of Issue", "Date of Expiry"]
54
- values = [extracted_data[field] if extracted_data[field] else "" for field in fields]
55
- df = pd.DataFrame(list(zip(fields, values)), columns=['Field', 'Value'])
56
- st.dataframe(df)
57
-
58
- def get_csv_download_link(df):
59
- csv = df.to_csv(index=False)
60
- b64 = base64.b64encode(csv.encode()).decode()
61
- href = f'<a href="data:file/csv;base64,{b64}" download="extracted_data.csv">Download CSV File</a>'
62
- return href
63
-
64
- def data_extraction_page():
65
- st.title('ID Card Text Extraction')
66
-
67
- uploaded_file = st.file_uploader("Upload an image of your ID card", type=["jpg", "jpeg", "png"])
68
-
69
- if uploaded_file is not None:
70
- image = Image.open(uploaded_file)
71
- st.image(image, caption='Uploaded Image', use_column_width=True)
72
- extracted_data = process_image(image)
73
- display_table(extracted_data)
74
-
75
- st.markdown(get_csv_download_link(pd.DataFrame(list(extracted_data.items()), columns=['Field', 'Value'])), unsafe_allow_html=True)
 
1
+ import streamlit as st
2
+ from PIL import Image
3
+ import numpy as np
4
+ import easyocr
5
+ import pandas as pd
6
+ import base64
7
+ import re
8
+ from datetime import datetime, timedelta
9
+
10
# Patterns are anchored at the start of the OCR token only (re.match), as before.
_IDENTITY_NUMBER_RE = re.compile(r'\d{5}-\d{7}-\d')
_DATE_RE = re.compile(r'\d{2}\.\d{2}\.\d{4}')
_DATE_FORMAT = "%d.%m.%Y"
_ESTIMATED_VALIDITY_YEARS = 10
# Dates are assigned to these fields in the order they are detected.
_DATE_FIELDS = ("Date of Birth", "Date of Issue", "Date of Expiry")


def _estimate_expiry(date_of_issue_text):
    """Return the issue date plus ten years as ``DD.MM.YYYY`` text.

    Returns None when the text is not a valid calendar date or the
    resulting year is out of range.
    """
    try:
        issued = datetime.strptime(date_of_issue_text, _DATE_FORMAT)
    except ValueError:
        return None

    target_year = issued.year + _ESTIMATED_VALIDITY_YEARS
    # A 29 February issue date has no counterpart ten years later (a leap
    # year + 10 is never a leap year), so retry once with day 28 instead of
    # silently giving up on the estimate.
    for day in (issued.day, 28):
        try:
            return issued.replace(year=target_year, day=day).strftime(_DATE_FORMAT)
        except ValueError:
            continue
    return None


def _parse_detections(result):
    """Map a list of OCR detections onto the ID-card field dictionary.

    Each detection is indexed as ``detection[1]`` to obtain its text.
    """
    extracted_data = {
        "Name": None,
        "Father Name": None,
        "Gender": None,
        "Country of Stay": "Pakistan",
        "Identity Number": None,
        "Date of Birth": None,
        "Date of Issue": None,
        "Date of Expiry": None,
    }

    for i, detection in enumerate(result):
        text = detection[1].strip()
        lowered = text.lower()

        if "name" in lowered or "father" in lowered:
            # A label was detected; its value is taken from the detection
            # that immediately follows it (None if the label is last).
            value = result[i + 1][1].strip() if i + 1 < len(result) else None
            field = "Father Name" if "father" in lowered else "Name"
            extracted_data[field] = value
            continue

        if lowered in ("m", "f"):
            extracted_data["Gender"] = text.upper()
            continue

        if _IDENTITY_NUMBER_RE.match(text):
            extracted_data["Identity Number"] = text
            continue

        date_match = _DATE_RE.match(text)
        if date_match:
            # Keep only the matched DD.MM.YYYY portion so that trailing OCR
            # noise cannot break the later strptime() call.
            date_text = date_match.group()
            # Fill the first still-empty date field. A third date, if one is
            # read, is kept as the expiry date instead of being discarded.
            # NOTE(review): assumes dates are detected in birth/issue/expiry
            # order -- confirm against the card layout.
            for field in _DATE_FIELDS:
                if extracted_data[field] is None:
                    extracted_data[field] = date_text
                    break

    # Only estimate the expiry when none was read from the card.
    if extracted_data["Date of Issue"] and not extracted_data["Date of Expiry"]:
        extracted_data["Date of Expiry"] = _estimate_expiry(extracted_data["Date of Issue"])

    return extracted_data


def process_image(image):
    """Run OCR on an ID-card image and extract its fields.

    Args:
        image: The image to read; it is converted with ``np.array(image)``
            before being handed to EasyOCR.

    Returns:
        dict: Keys "Name", "Father Name", "Gender", "Country of Stay",
        "Identity Number", "Date of Birth", "Date of Issue" and
        "Date of Expiry". Fields that could not be found are None;
        "Country of Stay" is always "Pakistan". When no expiry date is read,
        it is estimated as the issue date plus ten years.
    """
    # NOTE(review): a new Reader is built on every call; consider reusing one
    # if model loading turns out to dominate the request time.
    reader = easyocr.Reader(['en'], gpu=False)
    img_np = np.array(image)
    result = reader.readtext(img_np)
    return _parse_detections(result)
51
+
52
def display_table(extracted_data):
    """Show the extracted fields as a two-column Field/Value table.

    Args:
        extracted_data: Mapping that contains every field listed below;
            falsy values (such as None) are shown as empty strings.
    """
    ordered_fields = [
        "Name",
        "Father Name",
        "Gender",
        "Country of Stay",
        "Identity Number",
        "Date of Birth",
        "Date of Issue",
        "Date of Expiry",
    ]

    rows = []
    for label in ordered_fields:
        value = extracted_data[label]
        rows.append((label, value or ""))

    table = pd.DataFrame(rows, columns=['Field', 'Value'])
    st.dataframe(table)
57
+
58
def get_csv_download_link(df):
    """Build an HTML anchor that downloads ``df`` as a CSV file.

    The CSV (written without the index) is UTF-8 encoded, base64 encoded and
    embedded in a ``data:`` URL, so no server-side file is needed.

    Args:
        df: The pandas DataFrame to export.

    Returns:
        str: An ``<a>`` element whose ``download`` attribute suggests the
        file name ``extracted_data.csv``.
    """
    csv_bytes = df.to_csv(index=False).encode("utf-8")
    b64 = base64.b64encode(csv_bytes).decode("ascii")
    # "text/csv" is the registered media type for CSV ("file/csv" is not one);
    # the charset is declared because the payload is UTF-8 encoded above.
    return (
        f'<a href="data:text/csv;charset=utf-8;base64,{b64}" '
        f'download="extracted_data.csv">Download CSV File</a>'
    )
63
+
64
def data_extraction_page():
    """Render the Streamlit page for ID-card text extraction.

    Shows an image uploader; once a file is provided, the image is displayed,
    passed through ``process_image``, shown as a table, and offered as a CSV
    download link.
    """
    st.title('ID Card Text Extraction')

    upload = st.file_uploader(
        "Upload an image of your ID card to Extract Data",
        type=["jpg", "jpeg", "png"],
    )

    # Nothing further to render until the user has supplied a file.
    if upload is None:
        return

    card_image = Image.open(upload)
    st.image(
        card_image,
        caption='Wait...! We Are Extracting Data For You',
        use_column_width=True,
    )

    fields = process_image(card_image)
    display_table(fields)

    # The CSV export uses the raw field/value pairs from the extraction result.
    csv_frame = pd.DataFrame(list(fields.items()), columns=['Field', 'Value'])
    download_link = get_csv_download_link(csv_frame)
    st.markdown(download_link, unsafe_allow_html=True)