usmanabbasi
committed on
Commit
•
60a5e14
1
Parent(s):
4b81d29
Update DataExtraction.py
Browse files- DataExtraction.py +75 -75
DataExtraction.py
CHANGED
@@ -1,75 +1,75 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
from PIL import Image
|
3 |
-
import numpy as np
|
4 |
-
import easyocr
|
5 |
-
import pandas as pd
|
6 |
-
import base64
|
7 |
-
import re
|
8 |
-
from datetime import datetime, timedelta
|
9 |
-
|
10 |
-
# Identity number layout matched on the card: 5 digits, 7 digits, 1 digit,
# dash-separated. The look-arounds keep the pattern from matching inside a
# longer run of digits (which would silently yield a shifted, wrong number).
_ID_NUMBER_RE = re.compile(r'(?<!\d)\d{5}-\d{7}-\d(?!\d)')
# Dates written as DD.MM.YYYY.
_DATE_RE = re.compile(r'(?<!\d)\d{2}\.\d{2}\.\d{4}(?!\d)')
_DATE_FORMAT = "%d.%m.%Y"
# Order in which successive dates found in the OCR output are assigned.
# NOTE(review): assumes the card lists birth, issue, expiry in that order - confirm.
_DATE_FIELDS = ("Date of Birth", "Date of Issue", "Date of Expiry")
# Validity period used when no expiry date was read from the card.
_VALIDITY_YEARS = 10


def _expiry_from_issue(issue_text):
    """Return the date ``_VALIDITY_YEARS`` after *issue_text* as DD.MM.YYYY, or None."""
    try:
        issued = datetime.strptime(issue_text, _DATE_FORMAT)
    except ValueError:
        # OCR produced something date-shaped but invalid (e.g. "99.99.2015");
        # best effort: leave the expiry unknown rather than fail the extraction.
        return None

    target_year = issued.year + _VALIDITY_YEARS
    # First try the same calendar day; a 29 Feb issue date has no counterpart
    # in a non-leap target year, so fall back to 28 Feb instead of giving up.
    for day in (issued.day, 28):
        try:
            return issued.replace(year=target_year, day=day).strftime(_DATE_FORMAT)
        except ValueError:
            continue
    # Both attempts failed: the target year is outside datetime's range.
    return None


def _parse_detections(texts):
    """Map a sequence of OCR text snippets (in detection order) onto the ID-card fields."""
    extracted_data = {
        "Name": None,
        "Father Name": None,
        "Gender": None,
        "Country of Stay": "Pakistan",
        "Identity Number": None,
        "Date of Birth": None,
        "Date of Issue": None,
        "Date of Expiry": None,
    }

    for i, text in enumerate(texts):
        lowered = text.lower()
        # A label's value is taken from the detection that follows it.
        following = texts[i + 1] if i + 1 < len(texts) else None

        if "father" in lowered:
            # Only assign when a value actually follows, so a trailing label
            # cannot wipe out a value found earlier.
            if following is not None:
                extracted_data["Father Name"] = following
            continue
        if "name" in lowered:
            if following is not None:
                extracted_data["Name"] = following
            continue
        if lowered in ("m", "f"):
            extracted_data["Gender"] = text.upper()
            continue

        id_match = _ID_NUMBER_RE.search(text)
        if id_match:
            # Keep only the matched number, not any text the OCR glued onto it.
            extracted_data["Identity Number"] = id_match.group()
            continue

        date_match = _DATE_RE.search(text)
        if date_match:
            # Fill the first date field that is still empty; extra dates are ignored.
            for field in _DATE_FIELDS:
                if extracted_data[field] is None:
                    extracted_data[field] = date_match.group()
                    break

    # No expiry date was read from the card: derive it from the issue date.
    if extracted_data["Date of Issue"] and not extracted_data["Date of Expiry"]:
        extracted_data["Date of Expiry"] = _expiry_from_issue(extracted_data["Date of Issue"])

    return extracted_data


def process_image(image):
    """Run OCR on an ID-card image and extract the known fields from the text.

    Args:
        image: The image to read. It is converted with ``np.array`` before
            being handed to the OCR reader (the page passes a PIL image).

    Returns:
        dict: Keys "Name", "Father Name", "Gender", "Country of Stay",
        "Identity Number", "Date of Birth", "Date of Issue" and
        "Date of Expiry". Values are strings, or None when the field was not
        found. "Country of Stay" is always "Pakistan". When no expiry date is
        read, it is set to the issue date plus ten years if that can be
        computed.
    """
    # NOTE(review): a new Reader is constructed on every call - presumably
    # expensive; consider caching it if extraction feels slow.
    reader = easyocr.Reader(['en'], gpu=False)
    img_np = np.array(image)
    result = reader.readtext(img_np)

    # Index 1 of each detection is used as the recognised text.
    texts = [detection[1].strip() for detection in result]
    return _parse_detections(texts)
|
51 |
-
|
52 |
-
def display_table(extracted_data):
    """Show the extracted ID-card fields as a two-column table in the Streamlit page.

    Args:
        extracted_data: Mapping of field name to extracted value. Missing
            fields and falsy values (e.g. None) are rendered as an empty string.
    """
    fields = ["Name", "Father Name", "Gender", "Country of Stay", "Identity Number", "Date of Birth", "Date of Issue", "Date of Expiry"]
    # .get() keeps the table rendering even if a field is absent from the dict.
    values = [extracted_data.get(field) or "" for field in fields]
    df = pd.DataFrame(list(zip(fields, values)), columns=['Field', 'Value'])
    st.dataframe(df)
|
57 |
-
|
58 |
-
def get_csv_download_link(df):
    """Build an HTML anchor that downloads *df* as ``extracted_data.csv``.

    The CSV (written without the index) is embedded in the link itself as a
    base64 ``data:`` URL, so no file is written on the server.

    Args:
        df: DataFrame to export.

    Returns:
        str: An ``<a>`` element as an HTML string.
    """
    csv_text = df.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode("utf-8")).decode("ascii")
    # "text/csv" is the registered media type for CSV; the charset parameter
    # tells the browser how the embedded bytes were encoded.
    return f'<a href="data:text/csv;charset=utf-8;base64,{b64}" download="extracted_data.csv">Download CSV File</a>'
|
63 |
-
|
64 |
-
def data_extraction_page():
|
65 |
-
st.title('ID Card Text Extraction')
|
66 |
-
|
67 |
-
uploaded_file = st.file_uploader("Upload an image of your ID card", type=["jpg", "jpeg", "png"])
|
68 |
-
|
69 |
-
if uploaded_file is not None:
|
70 |
-
image = Image.open(uploaded_file)
|
71 |
-
st.image(image, caption='
|
72 |
-
extracted_data = process_image(image)
|
73 |
-
display_table(extracted_data)
|
74 |
-
|
75 |
-
st.markdown(get_csv_download_link(pd.DataFrame(list(extracted_data.items()), columns=['Field', 'Value'])), unsafe_allow_html=True)
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from PIL import Image
|
3 |
+
import numpy as np
|
4 |
+
import easyocr
|
5 |
+
import pandas as pd
|
6 |
+
import base64
|
7 |
+
import re
|
8 |
+
from datetime import datetime, timedelta
|
9 |
+
|
10 |
+
# Identity number layout matched on the card: 5 digits, 7 digits, 1 digit,
# dash-separated. The look-arounds keep the pattern from matching inside a
# longer run of digits (which would silently yield a shifted, wrong number).
_ID_NUMBER_RE = re.compile(r'(?<!\d)\d{5}-\d{7}-\d(?!\d)')
# Dates written as DD.MM.YYYY.
_DATE_RE = re.compile(r'(?<!\d)\d{2}\.\d{2}\.\d{4}(?!\d)')
_DATE_FORMAT = "%d.%m.%Y"
# Order in which successive dates found in the OCR output are assigned.
# NOTE(review): assumes the card lists birth, issue, expiry in that order - confirm.
_DATE_FIELDS = ("Date of Birth", "Date of Issue", "Date of Expiry")
# Validity period used when no expiry date was read from the card.
_VALIDITY_YEARS = 10


def _expiry_from_issue(issue_text):
    """Return the date ``_VALIDITY_YEARS`` after *issue_text* as DD.MM.YYYY, or None."""
    try:
        issued = datetime.strptime(issue_text, _DATE_FORMAT)
    except ValueError:
        # OCR produced something date-shaped but invalid (e.g. "99.99.2015");
        # best effort: leave the expiry unknown rather than fail the extraction.
        return None

    target_year = issued.year + _VALIDITY_YEARS
    # First try the same calendar day; a 29 Feb issue date has no counterpart
    # in a non-leap target year, so fall back to 28 Feb instead of giving up.
    for day in (issued.day, 28):
        try:
            return issued.replace(year=target_year, day=day).strftime(_DATE_FORMAT)
        except ValueError:
            continue
    # Both attempts failed: the target year is outside datetime's range.
    return None


def _parse_detections(texts):
    """Map a sequence of OCR text snippets (in detection order) onto the ID-card fields."""
    extracted_data = {
        "Name": None,
        "Father Name": None,
        "Gender": None,
        "Country of Stay": "Pakistan",
        "Identity Number": None,
        "Date of Birth": None,
        "Date of Issue": None,
        "Date of Expiry": None,
    }

    for i, text in enumerate(texts):
        lowered = text.lower()
        # A label's value is taken from the detection that follows it.
        following = texts[i + 1] if i + 1 < len(texts) else None

        if "father" in lowered:
            # Only assign when a value actually follows, so a trailing label
            # cannot wipe out a value found earlier.
            if following is not None:
                extracted_data["Father Name"] = following
            continue
        if "name" in lowered:
            if following is not None:
                extracted_data["Name"] = following
            continue
        if lowered in ("m", "f"):
            extracted_data["Gender"] = text.upper()
            continue

        id_match = _ID_NUMBER_RE.search(text)
        if id_match:
            # Keep only the matched number, not any text the OCR glued onto it.
            extracted_data["Identity Number"] = id_match.group()
            continue

        date_match = _DATE_RE.search(text)
        if date_match:
            # Fill the first date field that is still empty; extra dates are ignored.
            for field in _DATE_FIELDS:
                if extracted_data[field] is None:
                    extracted_data[field] = date_match.group()
                    break

    # No expiry date was read from the card: derive it from the issue date.
    if extracted_data["Date of Issue"] and not extracted_data["Date of Expiry"]:
        extracted_data["Date of Expiry"] = _expiry_from_issue(extracted_data["Date of Issue"])

    return extracted_data


def process_image(image):
    """Run OCR on an ID-card image and extract the known fields from the text.

    Args:
        image: The image to read. It is converted with ``np.array`` before
            being handed to the OCR reader (the page passes a PIL image).

    Returns:
        dict: Keys "Name", "Father Name", "Gender", "Country of Stay",
        "Identity Number", "Date of Birth", "Date of Issue" and
        "Date of Expiry". Values are strings, or None when the field was not
        found. "Country of Stay" is always "Pakistan". When no expiry date is
        read, it is set to the issue date plus ten years if that can be
        computed.
    """
    # NOTE(review): a new Reader is constructed on every call - presumably
    # expensive; consider caching it if extraction feels slow.
    reader = easyocr.Reader(['en'], gpu=False)
    img_np = np.array(image)
    result = reader.readtext(img_np)

    # Index 1 of each detection is used as the recognised text.
    texts = [detection[1].strip() for detection in result]
    return _parse_detections(texts)
|
51 |
+
|
52 |
+
def display_table(extracted_data):
    """Show the extracted ID-card fields as a two-column table in the Streamlit page.

    Args:
        extracted_data: Mapping of field name to extracted value. Missing
            fields and falsy values (e.g. None) are rendered as an empty string.
    """
    fields = ["Name", "Father Name", "Gender", "Country of Stay", "Identity Number", "Date of Birth", "Date of Issue", "Date of Expiry"]
    # .get() keeps the table rendering even if a field is absent from the dict.
    values = [extracted_data.get(field) or "" for field in fields]
    df = pd.DataFrame(list(zip(fields, values)), columns=['Field', 'Value'])
    st.dataframe(df)
|
57 |
+
|
58 |
+
def get_csv_download_link(df):
    """Build an HTML anchor that downloads *df* as ``extracted_data.csv``.

    The CSV (written without the index) is embedded in the link itself as a
    base64 ``data:`` URL, so no file is written on the server.

    Args:
        df: DataFrame to export.

    Returns:
        str: An ``<a>`` element as an HTML string.
    """
    csv_text = df.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode("utf-8")).decode("ascii")
    # "text/csv" is the registered media type for CSV; the charset parameter
    # tells the browser how the embedded bytes were encoded.
    return f'<a href="data:text/csv;charset=utf-8;base64,{b64}" download="extracted_data.csv">Download CSV File</a>'
|
63 |
+
|
64 |
+
def data_extraction_page():
    """Render the ID-card extraction page.

    Lets the user upload a JPG/PNG image, shows it, runs ``process_image`` on
    it, displays the extracted fields and offers them as a CSV download link.
    If the upload cannot be read as an image, an error message is shown
    instead.
    """
    st.title('ID Card Text Extraction')

    uploaded_file = st.file_uploader("Upload an image of your ID card to Extract Data", type=["jpg", "jpeg", "png"])

    # Nothing uploaded yet: only the title and uploader are rendered.
    if uploaded_file is None:
        return

    try:
        image = Image.open(uploaded_file)
        # Image.open is lazy; force decoding here so a corrupt or truncated
        # file fails inside this try instead of further down the page.
        image.load()
    except OSError:
        st.error("The uploaded file could not be read as an image. Please upload a valid JPG or PNG file.")
        return

    st.image(image, caption='Wait...! We Are Extracting Data For You', use_column_width=True)
    extracted_data = process_image(image)
    display_table(extracted_data)

    export_df = pd.DataFrame(list(extracted_data.items()), columns=['Field', 'Value'])
    # The link is raw HTML, so Streamlit must be told to render it unescaped.
    st.markdown(get_csv_download_link(export_df), unsafe_allow_html=True)
|