Spaces:
Sleeping
Sleeping
from google.cloud import vision | |
import streamlit as st | |
from google.oauth2 import service_account | |
import os | |
import io | |
import re | |
import pandas as pd | |
from PIL import Image | |
# Path to the service account key used by the Vision client.
# A GOOGLE_APPLICATION_CREDENTIALS value that is already set in the environment
# takes precedence; the key file shipped next to this script is only a fallback,
# so a deployment can supply its own credentials without editing the code.
service_account_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if not service_account_path:
    service_account_path = r'gcv-new-project-dd6ed833cc91.json'
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_account_path

# The client resolves its credentials from the environment variable above.
client = vision.ImageAnnotatorClient()
def extract_text_by_column(image_bytes, column, width):
    """
    Extracts text from a specified column (left, middle, right) of an image.

    The image is cropped to one vertical third, re-encoded as PNG and sent to
    the Vision text detection call of the module-level ``client``.

    :param image_bytes: Bytes of the image file
    :param column: Column to extract text from (1 for left, 2 for middle, 3 for right)
    :param width: The width of the image to calculate the bounding box
    :return: Extracted text from the specified column, or '' if no text was detected
    :raises RuntimeError: If the Vision API reports an error for the request
    """
    # Convert bytes data to an image
    source_img = Image.open(io.BytesIO(image_bytes))

    # Define the bounding box for the column (integer thirds of the given width)
    left = (column - 1) * width // 3
    right = column * width // 3

    # Crop the image to the specified column, keeping the full height
    column_img = source_img.crop((left, 0, right, source_img.height))

    # Convert the cropped image to PNG bytes for the API request
    png_buffer = io.BytesIO()
    column_img.save(png_buffer, format='PNG')

    # Perform text detection on the cropped image
    request_image = vision.Image(content=png_buffer.getvalue())
    response = client.text_detection(image=request_image)

    # The API reports per-image failures in the response body instead of
    # raising; without this check a failed request would look like "no text".
    if response.error.message:
        raise RuntimeError(
            f"Vision API text detection failed for column {column}: "
            f"{response.error.message}"
        )

    # Return the first annotation (full text)
    texts = response.text_annotations
    return texts[0].description if texts else ''
def normalize_and_extract(text):
    """
    Parses OCR text into a list of voter records.

    The words "Photo" and "Available" are removed first, then the text is split
    into entries on blank lines ("\\n\\n"). Removing a line that held only one
    of those words leaves an empty line, which also acts as an entry separator.
    Each entry is searched for the fields below; only the fields that are found
    are stored, and entries without any field are dropped.

    :param text: Raw text of one column as returned by text detection
    :return: List of dicts with any of the keys 'Name', 'Relation Name',
             'House Number', 'Age', 'Gender' (in that order); values are
             stripped strings
    """
    # "Name :" also occurs inside "Father Name :", "Husband Name :" etc.
    # The voter-name pattern therefore optionally consumes a relation prefix,
    # and matches that carry one are skipped, so a relative's name is never
    # reported as the voter's own name.
    name_pattern = re.compile(
        r"(?:(Husband|Father|Mother|Other)\s*)?Name\s*:\s*([^\n-]+)"
    )
    relation_pattern = re.compile(
        r"(Husband|Father|Mother|Other)\s*Name\s*:\s*([^\n-]+)"
    )
    # Remaining fields all capture their value in group 1
    simple_patterns = {
        'House Number': re.compile(r"House Number\s*:\s*([^\n]+)"),
        'Age': re.compile(r"Age\s*:\s*(\d+)"),
        'Gender': re.compile(r"Gender\s*:\s*(Female|Male)"),
    }

    # Normalize text to remove extraneous words and characters
    normalized_text = re.sub(r"Photo|Available", "", text)

    voter_info_list = []
    for entry in normalized_text.split('\n\n'):  # Split entries by double newlines
        voter_info = {}

        # Voter's own name: first "Name :" that is not part of a relation label
        for candidate in name_pattern.finditer(entry):
            if candidate.group(1) is None:
                voter_info['Name'] = candidate.group(2).strip()
                break

        # All relation kinds are stored under the single 'Relation Name' key
        relation_match = relation_pattern.search(entry)
        if relation_match:
            voter_info['Relation Name'] = relation_match.group(2).strip()

        for key, pattern in simple_patterns.items():
            match = pattern.search(entry)
            if match:
                voter_info[key] = match.group(1).strip()

        if voter_info:  # Only add non-empty records
            voter_info_list.append(voter_info)
    return voter_info_list
# --- Streamlit page: upload an image, OCR its three columns, show/export results ---
st.title('Voter Information Extraction')

uploaded_file = st.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])

if uploaded_file is not None:
    # Raw bytes of the upload; reused for the preview and for every column crop
    bytes_data = uploaded_file.getvalue()
    image = Image.open(io.BytesIO(bytes_data))
    width = image.width  # used to divide the image into three columns

    # Preview of the uploaded image
    st.image(image, caption='Uploaded Image.', use_column_width=True)

    # Processing only starts once the user presses the button
    if st.button('Extract Text'):
        # OCR the left (1), middle (2) and right (3) columns, in that order
        column_texts = [
            extract_text_by_column(bytes_data, column_no, width)
            for column_no in (1, 2, 3)
        ]

        # Parse each column's text and concatenate the records left to right
        all_voter_info = []
        for column_text in column_texts:
            all_voter_info.extend(normalize_and_extract(column_text))

        voter_df = pd.DataFrame(all_voter_info)

        # Show the table in the app
        st.dataframe(voter_df)

        # Offer the same table as a CSV download
        csv_payload = voter_df.to_csv(index=False)
        st.download_button(
            label="Download data as CSV",
            data=csv_payload,
            file_name='voter_info.csv',
            mime='text/csv',
        )