"""Streamlit app: OCR a three-column voter-list image and tabulate the voters.

The uploaded image is cut into three equal-width vertical strips, each strip is
sent to Google Cloud Vision text detection, and the returned text is parsed
with regular expressions into one record per voter.
"""

import io
import os
import re

import pandas as pd
import streamlit as st
from google.cloud import vision
from google.oauth2 import service_account
from PIL import Image

# Key file used only when GOOGLE_APPLICATION_CREDENTIALS is not already set, so
# a deployment can point at its own credentials without editing this file.
DEFAULT_SERVICE_ACCOUNT_KEY = r'gcv-new-project-dd6ed833cc91.json'

# Provide the path to your service account key
service_account_path = (
    os.getenv("GOOGLE_APPLICATION_CREDENTIALS") or DEFAULT_SERVICE_ACCOUNT_KEY
)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_account_path

client = vision.ImageAnnotatorClient()

# Words stripped from the OCR text before parsing.
_NOISE = re.compile(r"Photo|Available")

# Any "Name :" label. Group 1 is set when the label belongs to a relation
# ("Father Name :"), and is None for the voter's own "Name :" label.
_NAME_LABEL = re.compile(r"(?:(Husband|Father|Mother|Other)\s*)?Name\s*:")

# Value that follows a "Name :" label: rest of the line, stopping at a hyphen.
_NAME_VALUE = re.compile(r"\s*([^\n-]+)")

_RELATION_NAME = re.compile(
    r"(Husband|Father|Mother|Other)\s*Name\s*:\s*([^\n-]+)"
)

# Remaining fields; each pattern captures its value in group 1.
_SIMPLE_FIELDS = {
    'House Number': re.compile(r"House Number\s*:\s*([^\n]+)"),
    'Age': re.compile(r"Age\s*:\s*(\d+)"),
    'Gender': re.compile(r"Gender\s*:\s*(Female|Male)"),
}


def extract_text_by_column(image_bytes, column, width):
    """
    Extracts text from a specified column (left, middle, right) of an image.

    :param image_bytes: Bytes of the image file
    :param column: Column to extract text from (1 for left, 2 for middle, 3 for right)
    :param width: The width of the image to calculate the bounding box
    :return: Extracted text from the specified column ('' when none is found)
    :raises ValueError: If column is not 1, 2 or 3
    :raises RuntimeError: If the Vision API reports an error for the request
    """
    if column not in (1, 2, 3):
        raise ValueError(f"column must be 1, 2 or 3, got {column!r}")

    # Define the bounding box for the column
    left = (column - 1) * width // 3
    right = column * width // 3

    png_buffer = io.BytesIO()
    with Image.open(io.BytesIO(image_bytes)) as image:
        # Crop the image to the specified column
        column_img = image.crop((left, 0, right, image.height))
        # JPEG uploads may be CMYK, which the PNG encoder cannot write.
        if column_img.mode == "CMYK":
            column_img = column_img.convert("RGB")
        column_img.save(png_buffer, format='PNG')

    # Perform text detection on the cropped image
    response = client.text_detection(
        image=vision.Image(content=png_buffer.getvalue())
    )
    # A failed request carries its reason here; without this check it would be
    # indistinguishable from a column that simply contains no text.
    if response.error.message:
        raise RuntimeError(
            f"Google Cloud Vision text detection failed: {response.error.message}"
        )

    # Return the first annotation (full text)
    texts = response.text_annotations
    return texts[0].description if texts else ''


def _voter_name_labels(text):
    """Return the matches of "Name :" labels that are not relation labels."""
    return [m for m in _NAME_LABEL.finditer(text) if m.group(1) is None]


def _split_voter_records(entry):
    """Split a chunk of text so that each piece holds at most one voter.

    A chunk with zero or one voter "Name :" label is returned unchanged. With
    several labels, a new piece starts at the second and each later label; any
    text before the first label stays with the first piece.
    """
    labels = _voter_name_labels(entry)
    if len(labels) < 2:
        return [entry]
    cuts = [0] + [label.start() for label in labels[1:]] + [len(entry)]
    return [entry[start:end] for start, end in zip(cuts, cuts[1:])]


def _extract_voter_fields(record):
    """Return the fields found in one voter record as a dict (possibly empty)."""
    voter_info = {}

    # Only a label without a relation prefix counts as the voter's own name,
    # so "Father Name : X" can never be reported as the voter's name.
    for label in _voter_name_labels(record):
        value = _NAME_VALUE.match(record, label.end())
        if value:
            voter_info['Name'] = value.group(1).strip()
            break

    # Normalize all relation names to 'Relation Name'
    relation = _RELATION_NAME.search(record)
    if relation:
        voter_info['Relation Name'] = relation.group(2).strip()

    for key, pattern in _SIMPLE_FIELDS.items():
        match = pattern.search(record)
        if match:
            voter_info[key] = match.group(1).strip()

    return voter_info


def normalize_and_extract(text):
    """
    Parses OCR text into a list of voter records.

    :param text: Text returned for one image column
    :return: List of dicts with any of the keys 'Name', 'Relation Name',
        'House Number', 'Age' and 'Gender'; records with no recognised field
        are omitted
    """
    # Normalize text to remove extraneous words and characters
    normalized_text = _NOISE.sub("", text)

    voter_info_list = []
    # Entries are separated by blank lines; a chunk that still contains several
    # voters is split further so that none of them is dropped.
    for entry in normalized_text.split('\n\n'):
        for record in _split_voter_records(entry):
            voter_info = _extract_voter_fields(record)
            if voter_info:  # Only add non-empty records
                voter_info_list.append(voter_info)
    return voter_info_list


def main():
    """Render the upload / extract / download page."""
    st.title('Voter Information Extraction')

    uploaded_file = st.file_uploader(
        "Choose an image...", type=["png", "jpg", "jpeg"]
    )
    if uploaded_file is None:
        return

    # To read file as bytes:
    bytes_data = uploaded_file.getvalue()
    image = Image.open(io.BytesIO(bytes_data))
    width = image.width  # Get the width of the image for column division

    # Display the uploaded image
    st.image(image, caption='Uploaded Image.', use_column_width=True)

    # If the user confirms to process the image
    if not st.button('Extract Text'):
        return

    # Extract and parse the columns left to right, combining all records
    try:
        all_voter_info = [
            voter_info
            for column in (1, 2, 3)
            for voter_info in normalize_and_extract(
                extract_text_by_column(bytes_data, column, width)
            )
        ]
    except RuntimeError as err:
        # Raised by extract_text_by_column when the Vision API reports an error
        st.error(str(err))
        return

    voter_df = pd.DataFrame(all_voter_info)

    # Display the DataFrame in the Streamlit app
    st.dataframe(voter_df)

    # Optional: provide download link for the data
    st.download_button(
        label="Download data as CSV",
        data=voter_df.to_csv(index=False),
        file_name='voter_info.csv',
        mime='text/csv',
    )


if __name__ == "__main__":
    main()