File size: 4,290 Bytes
2d3bc6e
 
 
 
09f0d23
 
 
2d3bc6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7a3211
 
 
 
 
 
 
 
 
 
 
 
 
2d3bc6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7a3211
 
 
 
 
2d3bc6e
891d816
 
4efd35b
2d3bc6e
 
 
 
 
 
 
4efd35b
2d3bc6e
 
 
 
 
fc3792c
2d3bc6e
fd2307e
 
 
 
 
2d3bc6e
7381cc5
 
2d3bc6e
 
 
b7a3211
 
2d3bc6e
7381cc5
fd2307e
 
 
 
fc3792c
fd2307e
fc3792c
fd2307e
fc3792c
fd2307e
fc3792c
fd2307e
2d3bc6e
 
 
 
 
 
 
 
 
80b184f
2d3bc6e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import streamlit as st
import pandas as pd
from io import BytesIO
from helper import get_res_df
import nltk

nltk.download('wordnet')

def to_excel(df):
    """
    Convert a Pandas DataFrame to an Excel file in memory.
    
    Parameters:
    df (DataFrame): The DataFrame to be converted to Excel format.

    Returns:
    bytes: The in-memory Excel file data.
    """
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False)
    processed_data = output.getvalue()
    return processed_data

def validate_columns(df):
    required_columns = {
        "Employer Number", "Employer Name", "Bank Statement Reference", "Employer Province",
        "Region", "Industry", "Contributing Status", "Date Joined Plan", "Termination Date", "Email Addresses"
    }
    missing_columns = required_columns - set(df.columns)
    
    if missing_columns:
        st.error(f"Upload a valid Master Sheet. These columns are missing: {', '.join(missing_columns)}")
        return None
    else:
        return df

def process_files(excel_file, text_file):
    """
    Process the uploaded Excel/CSV and text files and return cleaned dataframes.

    Parameters:
    excel_file (UploadedFile): The uploaded Excel or CSV file.
    text_file (UploadedFile): The uploaded text file.

    Returns:
    Tuple[DataFrame, DataFrame]: A tuple containing the cleaned DataFrame from the Excel/CSV file
                                 and a DataFrame created from the text file data.
    """
    print(excel_file, text_file)  # Debugging information

    # Read the Excel/CSV file into a DataFrame
    if excel_file.name.endswith('.csv'):
        df_excel = pd.read_csv(excel_file)
    else:
        df_excel = pd.read_excel(excel_file)

    # Validate Master Sheet
    df_excel = validate_columns(df_excel)
    if df_excel is None:
        return None, None
    
    # Ensure the 'Employer Number' column values are formatted as zero-padded 6-digit strings
    df_excel['Employer Number'] = [str(number).zfill(6) for number in df_excel['Employer Number']]
    df_excel = df_excel.dropna(subset=['Employer Name'])

    # Read and process the text file content into a list of lines
    lines = text_file.read().decode('utf-8').splitlines()
    data = [line.strip().split(',') for line in lines]  # Split each line by commas

    # Create a DataFrame from the parsed text file data
    df = pd.DataFrame(data)
    df = df[df[1].isin(['1001010071', '1001233102'])]

    return df_excel, df


# Streamlit UI section
st.title("Generate E-Batch")  # Application title

# Sidebar file uploader widgets to allow users to upload an Excel/CSV file and a text file
with st.sidebar:
    st.header("Upload Files")
    uploaded_excel = st.file_uploader("Upload the Master file (.xls or .csv)", type=["csv", "xls", "xlsx"])
    uploaded_text = st.file_uploader("Upload your Text file (.txt)", type=["txt"])

    threshold = st.slider("Adjust Threshold", min_value=0, max_value=30, value=12)

# Check if both files are uploaded
if uploaded_excel and uploaded_text:
    master_data, df = process_files(uploaded_excel, uploaded_text)  # Process the files
    if master_data is None and df is None:
        st.stop()

    res, fbdm, fben, fbbm25, nf = get_res_df(master_data, df, threshold)  # Generate the result DataFrame using the helper function
    
    # Create a layout with four columns to display the metrics
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Direct Match", fbdm)
    with col2:
        st.metric("Employer Match", fben)
    with col3:
        st.metric("AI Based Match", fbbm25)
    with col4:
        st.metric("No Clear Match", nf)

    st.write("Final Output")  # Display the result of file processing
    st.dataframe(res)  # Show the result in a table format on the web app

    # Convert the result DataFrame to an Excel file for download
    excel_data = to_excel(res)

    # Provide a button for the user to download the result as an Excel file
    st.download_button(label="Download Excel",
                       data=excel_data,
                       file_name='_Consolidated E-Batch for Banking_.xlsx',
                       mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')