kusssbuddy / app.py
marcelhuber's picture
Update app.py
f0e873d
import gradio as gr
import pandas as pd
import tabula
from PyPDF2 import PdfReader
import re
import numpy as np
import os
import tempfile
import shutil
# Header names assigned to every grade table extracted from the PDF.
_TABLE_COLUMNS = ['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung']

# Column order of the final table: the table columns first, followed by the
# fields that are split out of the combined 'LVA' cell.
_OUTPUT_COLUMNS = _TABLE_COLUMNS + ['Prüfer*in', 'Studium', 'LVA-Nr.']

# Numeric value of every grade that counts towards the weighted mean.
# 'mit Erfolg teilgenommen' (passed without a grade) is deliberately absent:
# such courses count towards the ECTS total but not towards the mean.
_GRADE_VALUES = {
    'sehr gut': 1,
    'gut': 2,
    'befriedigend': 3,
    'genügend': 4,
    'nicht genügend': 5,
}

# Course number: three digits, a dot, three word characters (e.g. "340.100").
_LVA_NR_PATTERN = re.compile(r'(\d{3})\.[\w]{3}')


def extract_tables_from_pdf(pdf_file):
    """Extract the tables of every page of a PDF with tabula.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF.

    Returns:
        A list with one entry per page; each entry is the list of
        DataFrames that ``tabula.read_pdf`` found on that page.
    """
    pdf_path = pdf_file.name
    # PyPDF2 is only needed to learn how many pages there are.
    with open(pdf_path, 'rb') as file:
        page_count = len(PdfReader(file).pages)
    return [
        tabula.read_pdf(pdf_path, pages=page_number, multiple_tables=True)
        for page_number in range(1, page_count + 1)
    ]


def extract_substring(input_string):
    """Return the first course number found in *input_string*, or ''."""
    match = _LVA_NR_PATTERN.search(input_string)
    return match.group() if match else ''


def extract_information(input_string):
    """Split a combined 'LVA' cell into its individual fields.

    The cell is expected to look like
    ``"<course title>\\r<study programme> <course number>\\r<examiner>"``.
    The text before the course number is split at its first carriage
    return into title and study programme; the text after the number is
    the examiner. Carriage returns inside a field become spaces and the
    surrounding whitespace is removed.

    Args:
        input_string: Cell content. Missing values (None/NaN) are treated
            as an empty string, other non-string values are converted
            with ``str``.

    Returns:
        A dict with the keys 'LVA', 'Studium', 'LVA Nr.' and "Prüfer'in".
        If no course number is present the cell cannot be split reliably,
        so the whole text is returned as 'LVA' and the other fields are
        empty.
    """
    if not isinstance(input_string, str):
        input_string = '' if pd.isna(input_string) else str(input_string)

    def clean(text):
        return text.replace('\r', ' ').strip()

    lva_nr = extract_substring(input_string)
    if not lva_nr:
        return {
            'LVA': clean(input_string),
            'Studium': '',
            'LVA Nr.': '',
            'Prüfer\'in': ''
        }

    before_nr, _, after_nr = input_string.partition(lva_nr)
    lva, _, studium = before_nr.partition('\r')
    return {
        'LVA': clean(lva),
        'Studium': clean(studium),
        'LVA Nr.': lva_nr,
        'Prüfer\'in': clean(after_nr)
    }


def _prepare_table(raw_table):
    """Return a cleaned copy of one extracted grade table.

    The column labels of *raw_table* are moved back into the table as its
    first row (they hold the first data record), the standard header is
    assigned, and the combined 'LVA' cell is split into separate columns.
    """
    table = raw_table.copy()

    # Re-insert the parsed header as row 0 before overwriting it.
    table.loc[-1] = table.columns
    table.index = table.index + 1
    table = table.sort_index()
    table.columns = _TABLE_COLUMNS

    # Build whole columns at once instead of writing cell by cell through
    # chained indexing, which does not reliably modify the DataFrame.
    parsed = [extract_information(cell) for cell in table['LVA']]
    table['LVA'] = [info['LVA'] for info in parsed]
    # Each insert at position 1 pushes the previous one to the right, so the
    # resulting order is: LVA, Prüfer*in, Studium, LVA-Nr., Typ, ...
    table.insert(1, 'LVA-Nr.', [info['LVA Nr.'] for info in parsed])
    table.insert(1, 'Studium', [info['Studium'] for info in parsed])
    table.insert(1, 'Prüfer*in', [info['Prüfer\'in'] for info in parsed])
    return table


def _weighted_grade_mean(table):
    """Return the ECTS-weighted mean grade of *table*.

    Only rows whose 'Beurteilung' is one of the five numeric grades take
    part, in the numerator and in the denominator. Returns NaN when no
    graded ECTS are present.
    """
    grades = table['Beurteilung'].str.strip().map(_GRADE_VALUES)
    graded = grades.notna()
    graded_ects = table.loc[graded, 'ECTS']
    total_graded_ects = graded_ects.sum()
    if not total_graded_ects:
        return float('nan')
    return (graded_ects * grades[graded]).sum() / total_graded_ects


def pdf_processing(pdf_file):
    """Turn a grade PDF into a table, an ECTS total and a weighted mean.

    For every page the second table found by tabula is taken, cleaned and
    appended to the result. Pages with fewer than two tables are skipped.
    The final table is also written to ``sample.csv`` in the current
    working directory.

    Args:
        pdf_file: Uploaded file object whose ``name`` attribute is the
            path of the PDF.

    Returns:
        A tuple ``(total_ects, weighted_mean, table, csv_path)`` where
        ``weighted_mean`` is rounded to two decimals and is NaN if the
        table contains no graded course.

    Raises:
        ValueError: If no file was provided.
    """
    if pdf_file is None:
        raise ValueError('No PDF file was provided.')

    frames = []
    for page_tables in extract_tables_from_pdf(pdf_file):
        # The grade table is the second table of a page; skip pages that
        # do not have one instead of re-using the previous page's data.
        if len(page_tables) < 2:
            continue
        frames.append(_prepare_table(page_tables[1]))

    if frames:
        final_df = pd.concat(frames, ignore_index=True)[_OUTPUT_COLUMNS]
    else:
        final_df = pd.DataFrame(columns=_TABLE_COLUMNS)

    # Remove \r from the grade text so it matches the grade names
    final_df['Beurteilung'] = final_df['Beurteilung'].str.replace('\r', ' ')
    final_df['ECTS'] = final_df['ECTS'].astype(float)
    final_df = final_df.drop_duplicates()

    column_sum = final_df['ECTS'].sum()
    wm = _weighted_grade_mean(final_df)

    # Save the CSV file to the working directory
    csv_save_path = os.path.join(os.getcwd(), "sample.csv")
    final_df.to_csv(csv_save_path, index=False)
    return column_sum, np.round(wm, 2), final_df, csv_save_path
# ---------------------------------------------------------------------------
# Gradio user interface
# ---------------------------------------------------------------------------

# Single input: the uploaded PDF, handed to pdf_processing as a file object.
inputs = gr.inputs.File(type="file", label="Select PDF file")

# Four outputs, in the same order as the tuple returned by pdf_processing.
outputs = [
    gr.outputs.Textbox(label="Total ECTS"),
    gr.outputs.Textbox(label="Weighted Mean"),
    gr.outputs.Dataframe(
        label="Processed DataFrame",
        type="pandas",
    ).style(full_width=True),
    gr.outputs.File(),
]

iface = gr.Interface(
    title="KUSSS Buddy",
    fn=pdf_processing,
    inputs=inputs,
    outputs=outputs,
)

# Start the web app without creating a public share link.
iface.launch(share=False)