import gradio as gr
import pandas as pd
import tabula
from PyPDF2 import PdfReader
import re
import numpy as np
import os
import tempfile
import shutil
# Header assigned to every table taken from the PDF.
_TABLE_COLUMNS = ['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung']

# Columns derived from the combined 'LVA' cell, in the order they are inserted
# right after the 'LVA' column.
_DERIVED_COLUMNS = ['Prüfer*in', 'Studium', 'LVA-Nr.']

# Column order of the resulting frame / CSV: table columns first, derived
# columns last.
_OUTPUT_COLUMNS = _TABLE_COLUMNS + _DERIVED_COLUMNS

# Numeric value of every grade that enters the weighted mean.  Assessments
# without a numeric grade (e.g. 'mit Erfolg teilgenommen') are deliberately
# absent: they must count neither in the numerator nor in the denominator.
_GRADE_VALUES = {
    'sehr gut': 1,
    'gut': 2,
    'befriedigend': 3,
    'genügend': 4,
    'nicht genügend': 5,
}

# Course number: three digits, a dot, three word characters.
_LVA_NR_PATTERN = re.compile(r'(\d{3})\.[\w]{3}')

# First line of a cell: everything up to the first carriage return.
_LVA_PATTERN = re.compile(r'^(.*?)\r', re.DOTALL)


def _extract_tables_from_pdf(pdf_file):
    """Return one entry per PDF page: the list of tables tabula found on it."""
    # PyPDF2 is only needed for the page count; tabula re-opens the file itself.
    with open(pdf_file.name, 'rb') as file:
        page_count = len(PdfReader(file).pages)
    return [
        tabula.read_pdf(pdf_file.name, pages=page, multiple_tables=True)
        for page in range(1, page_count + 1)
    ]


def _extract_information(input_string):
    """Split a combined 'LVA' cell into its four parts.

    The cell is cut at two anchors: the first carriage return (end of the
    course title) and the first course number matching ``ddd.www``.

    Returns:
        dict with the keys 'LVA', 'Studium', 'LVA Nr.' and "Prüfer'in".
        Without a carriage return 'LVA' is empty; without a course number
        'LVA Nr.' and 'Studium' are empty and "Prüfer'in" receives the whole
        input string.
    """
    title_match = _LVA_PATTERN.search(input_string)
    lva = title_match.group(1).strip() if title_match else ''

    number_match = _LVA_NR_PATTERN.search(input_string)
    if number_match:
        lva_nr = number_match.group()
        number_start, number_end = number_match.span()
    else:
        lva_nr = ''
        number_start = number_end = 0

    # Start right behind the carriage return that ends the title.  Using the
    # match position (not len(lva)) stays correct when the title was stripped.
    studium_start = title_match.end() if title_match else 1
    return {
        'LVA': lva,
        'Studium': input_string[studium_start:number_start],
        'LVA Nr.': lva_nr,
        'Prüfer\'in': input_string[number_end:],
    }


def _prepare_page_table(table):
    """Return a copy of *table* with the fixed header and the derived columns."""
    data_df = pd.DataFrame(table).copy()
    # Keep the parsed column labels as a data row (appended last) before the
    # fixed header replaces them.
    data_df.loc[-1] = list(data_df.columns)
    data_df = data_df.reset_index(drop=True)
    data_df.columns = _TABLE_COLUMNS
    for position, column in enumerate(_DERIVED_COLUMNS, start=1):
        data_df.insert(position, column, '')

    for row in data_df.index:
        cell = data_df.at[row, 'LVA']
        if not isinstance(cell, str):
            # Empty / NaN cell: nothing to split, leave the row untouched.
            continue
        information = _extract_information(cell)
        # .at writes into the frame itself; chained ``df[col][i] = ...`` is
        # not guaranteed to and is a no-op under pandas Copy-on-Write.
        data_df.at[row, 'LVA'] = information['LVA']
        data_df.at[row, 'Studium'] = information['Studium']
        data_df.at[row, 'LVA-Nr.'] = information['LVA Nr.']
        data_df.at[row, 'Prüfer*in'] = information['Prüfer\'in']
    return data_df


def pdf_processing(pdf_file):
    """Extract the grade tables of a PDF and summarise them.

    Args:
        pdf_file: object whose ``name`` attribute is the path of the PDF
            (presumably the upload object handed over by Gradio).

    Returns:
        A 4-tuple ``(total_ects, weighted_mean, final_df, csv_path)``:
        the sum of the 'ECTS' column after duplicate rows were dropped, the
        ECTS-weighted grade mean rounded to two decimals (NaN when no row
        carries a numeric grade), the processed DataFrame, and the path of
        the CSV written to the current working directory as ``sample.csv``.

    Only rows whose 'Beurteilung' is one of the five numeric grades enter the
    weighted mean; ungraded assessments still count towards the ECTS total.
    """
    extracted_tables = _extract_tables_from_pdf(pdf_file)

    page_frames = []
    for page_tables in extracted_tables:
        # The table of interest is the second one on a page; pages that do
        # not have one are skipped instead of raising IndexError.
        if len(page_tables) < 2:
            continue
        data = page_tables[1]
        print('Process finished!')
        page_frames.append(_prepare_page_table(data))

    if page_frames:
        final_df = pd.concat(page_frames, ignore_index=True)[_OUTPUT_COLUMNS]
    else:
        final_df = pd.DataFrame(columns=_OUTPUT_COLUMNS)

    # Multi-line grades arrive with embedded carriage returns.
    final_df['Beurteilung'] = final_df['Beurteilung'].str.replace(
        '\r', ' ', regex=False
    )
    final_df['ECTS'] = final_df['ECTS'].astype(float)
    final_df = final_df.drop_duplicates()

    column_sum = final_df['ECTS'].sum()

    # Weighted mean over graded rows only; unknown or ungraded entries map
    # to NaN and are left out of both numerator and denominator.
    grade_values = final_df['Beurteilung'].map(_GRADE_VALUES)
    graded = grade_values.notna()
    graded_ects = final_df.loc[graded, 'ECTS']
    graded_ects_sum = graded_ects.sum()
    if graded_ects_sum > 0:
        wm = (graded_ects * grade_values[graded]).sum() / graded_ects_sum
    else:
        wm = np.nan

    # Save the CSV file to the working directory
    csv_save_path = os.path.join(os.getcwd(), "sample.csv")
    final_df.to_csv(csv_save_path, index=False)
    return column_sum, np.round(wm, 2), final_df, csv_save_path
# Define the Gradio interface
inputs = gr.inputs.File(label="Select PDF file", type="file")
outputs = [
gr.outputs.Textbox(label="Total ECTS"),
gr.outputs.Textbox(label="Weighted Mean"),
gr.outputs.Dataframe(type="pandas", label="Processed DataFrame").style(full_width=True),
iface = gr.Interface(
title="KUSSS Buddy",
# Launch the Gradio interface