File size: 4,321 Bytes
d948051
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from google.cloud import vision
import streamlit as st
from google.oauth2 import service_account
import os
import io
import re
import pandas as pd
from PIL import Image

# Path to the service account key used by the Vision client.
# A GOOGLE_APPLICATION_CREDENTIALS value already present in the environment
# takes precedence; the key file named below (relative to the working
# directory) is only a fallback, so deployments can supply their own
# credentials without editing this file.
service_account_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if not service_account_path:
    service_account_path = r'gcv-new-project-dd6ed833cc91.json'
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_account_path

# Shared client, created once at import time and used by extract_text_by_column.
client = vision.ImageAnnotatorClient()

def extract_text_by_column(image_bytes, column, width):
    """
    Extracts text from a specified column (left, middle, right) of an image.

    The image is cropped to one third of ``width`` over its full height,
    re-encoded as PNG and sent to Vision text detection through the
    module-level ``client``.

    :param image_bytes: Bytes of the image file
    :param column: Column to extract text from (1 for left, 2 for middle, 3 for right)
    :param width: The width of the image to calculate the bounding box
    :return: Extracted text from the specified column, or '' if no text was detected
    :raises ValueError: If ``column`` is not 1, 2 or 3
    :raises RuntimeError: If the Vision API reports an error for the request
    """
    # Reject out-of-range columns instead of silently cropping outside the image
    if column not in (1, 2, 3):
        raise ValueError(f"column must be 1, 2 or 3, got {column!r}")

    # Define the bounding box for the column
    left = (column - 1) * width // 3
    right = column * width // 3

    img_byte_arr = io.BytesIO()
    with Image.open(io.BytesIO(image_bytes)) as image:
        # Crop the image to the specified column (full height)
        column_img = image.crop((left, 0, right, image.height))

        # The PNG writer cannot encode CMYK data, which JPEG uploads may carry
        if column_img.mode == "CMYK":
            column_img = column_img.convert("RGB")

        # Convert the cropped image to bytes
        column_img.save(img_byte_arr, format='PNG')

    # Perform text detection on the cropped image
    response = client.text_detection(image=vision.Image(content=img_byte_arr.getvalue()))

    # A failed request carries an error message and no annotations; surface it
    # rather than reporting the column as empty.
    if response.error.message:
        raise RuntimeError(
            f"Vision API text detection failed: {response.error.message}"
        )

    # Return the first annotation (full text)
    texts = response.text_annotations
    return texts[0].description if texts else ''


# Relation name field, e.g. "Father Name: X", "Father's Name: X",
# "Husbands Name: X". Group 2 holds the value.
_RELATION_NAME_RE = re.compile(
    r"(Husband|Father|Mother|Other)(?:['\u2019]?s)?\s*Name\s*:\s*([^\n-]+)"
)

# Any "Name:" field. Group 1 is set only when the field is a relation name,
# which lets callers tell a voter's own name apart from a relative's name.
# Group 2 holds the value.
_ANY_NAME_RE = re.compile(
    r"((?:Husband|Father|Mother|Other)(?:['\u2019]?s)?\s*)?Name\s*:\s*([^\n-]+)"
)

# Remaining single-group fields, in the order they are added to a record.
_SIMPLE_FIELD_RES = {
    'House Number': re.compile(r"House Number\s*:\s*([^\n]+)"),
    'Age': re.compile(r"Age\s*:\s*(\d+)"),
    'Gender': re.compile(r"Gender\s*:\s*(Female|Male)"),
}


def _voter_name_matches(text):
    """Return the ``Name:`` matches in *text* that are not relation names."""
    return [m for m in _ANY_NAME_RE.finditer(text) if m.group(1) is None]


def _split_records(chunk):
    """Split *chunk* in front of every voter name after the first one."""
    starts = [m.start() for m in _voter_name_matches(chunk)]
    if len(starts) < 2:
        return [chunk]
    # Text before the first name stays with the first record.
    bounds = [0] + starts[1:] + [len(chunk)]
    return [chunk[lo:hi] for lo, hi in zip(bounds, bounds[1:])]


def _parse_record(entry):
    """Extract the known fields of a single voter record into a dict."""
    voter_info = {}

    names = _voter_name_matches(entry)
    if names:
        voter_info['Name'] = names[0].group(2).strip()

    # Normalize all relation names to 'Relation Name'
    relation = _RELATION_NAME_RE.search(entry)
    if relation:
        voter_info['Relation Name'] = relation.group(2).strip()

    for key, pattern in _SIMPLE_FIELD_RES.items():
        match = pattern.search(entry)
        if match:
            voter_info[key] = match.group(1).strip()

    return voter_info


def normalize_and_extract(text):
    """
    Parses OCR text into a list of voter records.

    The words "Photo" and "Available" are removed, the text is split on blank
    lines, and each chunk is further split in front of every additional voter
    ``Name:`` line so that consecutive records without a blank line between
    them are all returned. A relation name ("Father Name: ...") is never
    reported as the voter's own name. Splitting assumes each record starts
    with its voter ``Name:`` line.

    :param text: OCR text of one column; ``None`` or '' yields an empty list
    :return: List of dicts with any of the keys 'Name', 'Relation Name',
             'House Number', 'Age', 'Gender' (all values are strings);
             records with no recognised field are omitted
    """
    if not text:
        return []

    # Normalize text to remove extraneous words and characters
    normalized_text = re.sub(r"Photo|Available", "", text)

    voter_info_list = []
    for chunk in normalized_text.split('\n\n'):  # Split entries by double newlines
        for entry in _split_records(chunk):
            voter_info = _parse_record(entry)
            if voter_info:  # Only add non-empty records
                voter_info_list.append(voter_info)

    return voter_info_list

st.title('Voter Information Extraction')

uploaded_file = st.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])
if uploaded_file is not None:
    # Keep the raw bytes: every column is cropped from them independently
    bytes_data = uploaded_file.getvalue()
    image = Image.open(io.BytesIO(bytes_data))
    width = image.width  # Get the width of the image for column division

    # Display the uploaded image
    st.image(image, caption='Uploaded Image.', use_column_width=True)

    # If the user confirms to process the image
    if st.button('Extract Text'):
        # Walk the columns left to right (1, 2, 3) so the combined rows keep
        # that order: OCR each column, then parse its text into records.
        all_voter_info = []
        for column in (1, 2, 3):
            column_text = extract_text_by_column(bytes_data, column, width)
            all_voter_info.extend(normalize_and_extract(column_text))

        if all_voter_info:
            # Combine all column info into one dataframe
            voter_df = pd.DataFrame(all_voter_info)

            # Display the DataFrame in the Streamlit app
            st.dataframe(voter_df)

            # Optional: provide download link for the data
            st.download_button(label="Download data as CSV", data=voter_df.to_csv(index=False), file_name='voter_info.csv', mime='text/csv')
        else:
            # Nothing matched: say so instead of showing an empty table and
            # offering an empty CSV.
            st.warning("No voter information could be extracted from this image.")