Spaces:
Sleeping
Sleeping
import gradio as gr | |
import pandas as pd | |
import tabula | |
from PyPDF2 import PdfReader | |
import re | |
import numpy as np | |
import os | |
import tempfile | |
import shutil | |
# Column names assigned positionally to every extracted grade table.
_GRADE_COLUMNS = ['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung']

# Column layout of the processed result (the three columns split out of the
# raw 'LVA' cell are placed directly after 'LVA').
_OUTPUT_COLUMNS = ['LVA', 'Prüfer*in', 'Studium', 'LVA-Nr.',
                   'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung']

# Position of the grade table inside the list of tables tabula returns for a
# page.  NOTE(review): presumably index 0 is a page header table -- confirm
# against a real transcript.
_GRADE_TABLE_INDEX = 1

# Course number: three digits, a dot, three word characters (e.g. "123.456").
_LVA_NR_RE = re.compile(r'(\d{3})\.[\w]{3}')

# Numeric value of every graded assessment.  Assessments that are not listed
# here (e.g. 'mit Erfolg teilgenommen') carry no grade and are left out of
# the weighted mean.
_GRADE_VALUES = {
    'sehr gut': 1,
    'gut': 2,
    'befriedigend': 3,
    'genügend': 4,
    'nicht genügend': 5,
}


def _resolve_pdf_path(pdf_file):
    """Return the filesystem path of the uploaded PDF.

    Accepts a plain path (``str`` / ``os.PathLike``) as well as a file-like
    object exposing the path as ``.name``.

    Raises:
        ValueError: if no file was provided.
    """
    if pdf_file is None:
        raise ValueError('No PDF file was provided.')
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def _extract_tables_from_pdf(pdf_path):
    """Return one entry per PDF page: the list of tables tabula finds on it."""
    # PyPDF2 is only needed to learn the page count.
    with open(pdf_path, 'rb') as file:
        page_count = len(PdfReader(file).pages)
    # tabula page numbers are 1-based.
    return [
        tabula.read_pdf(pdf_path, pages=page_number, multiple_tables=True)
        for page_number in range(1, page_count + 1)
    ]


def _extract_information(input_string):
    """Split a raw 'LVA' cell into title, study programme, number and examiner.

    The cell is expected to look like
    ``"<title>\\r<study programme>\\r<course number>\\r<examiner>"``.

    Returns:
        dict with the keys 'LVA', 'Studium', 'LVA Nr.' and "Prüfer'in"; all
        values are whitespace-stripped strings ('' when a part is missing).
    """
    if not isinstance(input_string, str):
        # Empty cells arrive as NaN.
        input_string = ''

    title, separator, remainder = input_string.partition('\r')
    match = _LVA_NR_RE.search(input_string)

    if match is None:
        # Without a course number there is nothing to anchor the examiner
        # on; keep the text after the title as study programme.
        return {
            'LVA': title.strip(),
            'Studium': remainder.strip(),
            'LVA Nr.': '',
            'Prüfer\'in': '',
        }

    start, end = match.span()
    if separator and len(title) < start:
        # Regular layout: a line break separates title and study programme.
        lva = title.strip()
        studium = input_string[len(title) + 1:start].strip()
    else:
        # No line break ahead of the course number: title and study
        # programme cannot be told apart, so keep everything as the title.
        lva = input_string[:start].strip()
        studium = ''

    return {
        'LVA': lva,
        'Studium': studium,
        'LVA Nr.': match.group(),
        'Prüfer\'in': input_string[end:].strip(),
    }


def _prepare_grade_table(raw_table):
    """Turn one raw tabula table into a table with the output column layout.

    Raises:
        ValueError: if the table does not have exactly six columns.
    """
    if raw_table.shape[1] != len(_GRADE_COLUMNS):
        raise ValueError(
            'Expected a grade table with %d columns, got %d.'
            % (len(_GRADE_COLUMNS), raw_table.shape[1])
        )

    # tabula used the first data row as header, so the column labels are
    # really the first record: put them back as row 0.
    rows = [list(raw_table.columns)] + raw_table.values.tolist()
    table = pd.DataFrame(rows, columns=_GRADE_COLUMNS)

    details = [_extract_information(cell) for cell in table['LVA']]
    table['LVA'] = [info['LVA'] for info in details]
    # Inserting all three at position 1 yields the order
    # LVA, Prüfer*in, Studium, LVA-Nr., Typ, ...
    table.insert(1, 'LVA-Nr.', [info['LVA Nr.'] for info in details])
    table.insert(1, 'Studium', [info['Studium'] for info in details])
    table.insert(1, 'Prüfer*in', [info['Prüfer\'in'] for info in details])
    return table


def _weighted_grade_mean(ects, grades):
    """Return the ECTS-weighted mean grade over all graded courses.

    Courses whose assessment has no numeric grade do not contribute to the
    mean -- neither to the numerator nor to the denominator.  Returns NaN
    when there is no graded course.
    """
    numeric = pd.to_numeric(
        grades.map(
            lambda grade: _GRADE_VALUES.get(grade.strip())
            if isinstance(grade, str) else None
        ),
        errors='coerce',
    )
    graded = numeric.notna() & ects.notna()
    graded_ects = ects[graded].sum()
    if not graded_ects:
        return float('nan')
    return float((ects[graded] * numeric[graded]).sum() / graded_ects)


def pdf_processing(pdf_file):
    """Extract the grade tables of a transcript PDF and summarise them.

    Args:
        pdf_file: the uploaded PDF, either as a path or as a file-like
            object whose ``.name`` is the path.

    Returns:
        A tuple ``(total_ects, weighted_mean, table, csv_path)``: the sum of
        the 'ECTS' column, the ECTS-weighted mean grade rounded to two
        decimals, the processed DataFrame, and the path of the CSV export.

    Raises:
        ValueError: if no file is given, if a grade table does not have six
            columns, or if an 'ECTS' value is not numeric.
    """
    pdf_path = _resolve_pdf_path(pdf_file)

    page_frames = []
    for page_tables in _extract_tables_from_pdf(pdf_path):
        if len(page_tables) <= _GRADE_TABLE_INDEX:
            # This page has no grade table; skip it instead of re-using the
            # table of the previous page.
            continue
        page_frames.append(_prepare_grade_table(page_tables[_GRADE_TABLE_INDEX]))

    if page_frames:
        final_df = pd.concat(page_frames, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=_OUTPUT_COLUMNS)

    # Assessments may be wrapped over several lines inside the PDF cell.
    final_df['Beurteilung'] = final_df['Beurteilung'].str.replace('\r', ' ', regex=False)
    final_df['ECTS'] = final_df['ECTS'].astype(float)
    final_df = final_df.drop_duplicates()

    column_sum = final_df['ECTS'].sum()
    wm = _weighted_grade_mean(final_df['ECTS'], final_df['Beurteilung'])

    # NOTE: the export always goes to "sample.csv" in the working directory,
    # so every call overwrites the previous export.
    csv_save_path = os.path.join(os.getcwd(), "sample.csv")
    final_df.to_csv(csv_save_path, index=False)

    return column_sum, np.round(wm, 2), final_df, csv_save_path
# Define the Gradio interface.
# The components come from the top-level ``gr`` namespace: the ``gr.inputs`` /
# ``gr.outputs`` modules and the ``.style()`` method are deprecated in
# Gradio 3 and no longer exist in Gradio 4.
# NOTE(review): ``type`` is left at the File component's default because the
# accepted values differ between Gradio major versions -- confirm against the
# version this app is pinned to.
inputs = gr.File(label="Select PDF file")
outputs = [
    gr.Textbox(label="Total ECTS"),
    gr.Textbox(label="Weighted Mean"),
    gr.Dataframe(type="pandas", label="Processed DataFrame"),
    gr.File(),
]
iface = gr.Interface(
    fn=pdf_processing,
    inputs=inputs,
    outputs=outputs,
    title="KUSSS Buddy",
)
# Launch the Gradio interface (no public share link).
iface.launch(share=False)